add tgi2.4.0

81a882ad · jixx · 9822d7f6 · 81a882ad · 81a882ad · 81a882ad
Commit 81a882ad authored Nov 21, 2024 by jixx
20 changed files
--- a/integration-tests/models/__snapshots__/test_llava_next/test_flash_llava_next_load.json
+++ b/integration-tests/models/__snapshots__/test_llava_next/test_flash_llava_next_load.json
--- a/integration-tests/models/__snapshots__/test_lora_mistral/test_lora_mistral_with_customer_support_adapter.json
+++ b/integration-tests/models/__snapshots__/test_lora_mistral/test_lora_mistral_with_customer_support_adapter.json
+{
+  "details": {
+    "finish_reason": "length",
+    "generated_tokens": 40,
+    "prefill": [],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 13,
+        "logprob": -0.27416992,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 13,
+        "logprob": -0.17016602,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 28737,
+        "logprob": -2.7109375,
+        "special": false,
+        "text": "I"
+      },
+      {
+        "id": 28809,
+        "logprob": -1.5,
+        "special": false,
+        "text": "’"
+      },
+      {
+        "id": 28719,
+        "logprob": -0.34204102,
+        "special": false,
+        "text": "m"
+      },
+      {
+        "id": 459,
+        "logprob": -1.6914062,
+        "special": false,
+        "text": " not"
+      },
+      {
+        "id": 1864,
+        "logprob": -0.69140625,
+        "special": false,
+        "text": " sure"
+      },
+      {
+        "id": 513,
+        "logprob": -1.6171875,
+        "special": false,
+        "text": " if"
+      },
+      {
+        "id": 315,
+        "logprob": -1.3837891,
+        "special": false,
+        "text": " I"
+      },
+      {
+        "id": 541,
+        "logprob": -1.2226562,
+        "special": false,
+        "text": " can"
+      },
+      {
+        "id": 1567,
+        "logprob": -1.8652344,
+        "special": false,
+        "text": " come"
+      },
+      {
+        "id": 582,
+        "logprob": -0.0070228577,
+        "special": false,
+        "text": " up"
+      },
+      {
+        "id": 395,
+        "logprob": -0.0054092407,
+        "special": false,
+        "text": " with"
+      },
+      {
+        "id": 28705,
+        "logprob": -0.62597656,
+        "special": false,
+        "text": " "
+      },
+      {
+        "id": 28770,
+        "logprob": -0.0035572052,
+        "special": false,
+        "text": "3"
+      },
+      {
+        "id": 4842,
+        "logprob": -0.93603516,
+        "special": false,
+        "text": " unique"
+      },
+      {
+        "id": 3085,
+        "logprob": -0.028411865,
+        "special": false,
+        "text": " words"
+      },
+      {
+        "id": 369,
+        "logprob": -1.0400391,
+        "special": false,
+        "text": " that"
+      },
+      {
+        "id": 6685,
+        "logprob": -0.09710693,
+        "special": false,
+        "text": " describe"
+      },
+      {
+        "id": 528,
+        "logprob": -0.066467285,
+        "special": false,
+        "text": " me"
+      },
+      {
+        "id": 28725,
+        "logprob": -1.0722656,
+        "special": false,
+        "text": ","
+      },
+      {
+        "id": 562,
+        "logprob": -0.33422852,
+        "special": false,
+        "text": " but"
+      },
+      {
+        "id": 315,
+        "logprob": -0.5136719,
+        "special": false,
+        "text": " I"
+      },
+      {
+        "id": 28809,
+        "logprob": -0.8989258,
+        "special": false,
+        "text": "’"
+      },
+      {
+        "id": 584,
+        "logprob": -0.2076416,
+        "special": false,
+        "text": "ll"
+      },
+      {
+        "id": 1464,
+        "logprob": -0.8808594,
+        "special": false,
+        "text": " try"
+      },
+      {
+        "id": 28723,
+        "logprob": -0.88427734,
+        "special": false,
+        "text": "."
+      },
+      {
+        "id": 13,
+        "logprob": -0.91064453,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 13,
+        "logprob": -0.08105469,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 28740,
+        "logprob": -1.8486328,
+        "special": false,
+        "text": "1"
+      },
+      {
+        "id": 28723,
+        "logprob": -0.111572266,
+        "special": false,
+        "text": "."
+      },
+      {
+        "id": 23626,
+        "logprob": -3.15625,
+        "special": false,
+        "text": " Creative"
+      },
+      {
+        "id": 13,
+        "logprob": -0.9194336,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 28750,
+        "logprob": -0.24841309,
+        "special": false,
+        "text": "2"
+      },
+      {
+        "id": 28723,
+        "logprob": -9.393692e-05,
+        "special": false,
+        "text": "."
+      },
+      {
+        "id": 6785,
+        "logprob": -3.1386719,
+        "special": false,
+        "text": " Fun"
+      },
+      {
+        "id": 1780,
+        "logprob": -0.53564453,
+        "special": false,
+        "text": "ny"
+      },
+      {
+        "id": 13,
+        "logprob": -0.09033203,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 28770,
+        "logprob": -0.00466156,
+        "special": false,
+        "text": "3"
+      },
+      {
+        "id": 28723,
+        "logprob": -0.00016450882,
+        "special": false,
+        "text": "."
+      }
+    ]
+  },
+  "generated_text": "\n\nI’m not sure if I can come up with 3 unique words that describe me, but I’ll try.\n\n1. Creative\n2. Funny\n3."
+}
--- a/integration-tests/models/__snapshots__/test_lora_mistral/test_lora_mistral_with_dbpedia_adapter.json
+++ b/integration-tests/models/__snapshots__/test_lora_mistral/test_lora_mistral_with_dbpedia_adapter.json
+{
+  "details": {
+    "finish_reason": "eos_token",
+    "generated_tokens": 7,
+    "prefill": [],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 1,
+        "logprob": -0.49658203,
+        "special": true,
+        "text": "<s>"
+      },
+      {
+        "id": 28705,
+        "logprob": -0.0016384125,
+        "special": false,
+        "text": " "
+      },
+      {
+        "id": 1,
+        "logprob": -1.4931641,
+        "special": true,
+        "text": "<s>"
+      },
+      {
+        "id": 28705,
+        "logprob": -0.00075769424,
+        "special": false,
+        "text": " "
+      },
+      {
+        "id": 28740,
+        "logprob": -0.25024414,
+        "special": false,
+        "text": "1"
+      },
+      {
+        "id": 28740,
+        "logprob": -0.2631836,
+        "special": false,
+        "text": "1"
+      },
+      {
+        "id": 2,
+        "logprob": -0.0003285408,
+        "special": true,
+        "text": "</s>"
+      }
+    ]
+  },
+  "generated_text": "  11"
+}
--- a/integration-tests/models/__snapshots__/test_lora_mistral/test_lora_mistral_without_adapter.json
+++ b/integration-tests/models/__snapshots__/test_lora_mistral/test_lora_mistral_without_adapter.json
+{
+  "details": {
+    "finish_reason": "length",
+    "generated_tokens": 40,
+    "prefill": [],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 13,
+        "logprob": -1.0488281,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 13,
+        "logprob": -1.0800781,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 27332,
+        "logprob": -2.1152344,
+        "special": false,
+        "text": "###"
+      },
+      {
+        "id": 28705,
+        "logprob": -1.6748047,
+        "special": false,
+        "text": " "
+      },
+      {
+        "id": 28740,
+        "logprob": -0.097229004,
+        "special": false,
+        "text": "1"
+      },
+      {
+        "id": 28723,
+        "logprob": -0.16467285,
+        "special": false,
+        "text": "."
+      },
+      {
+        "id": 7615,
+        "logprob": -2.2246094,
+        "special": false,
+        "text": " News"
+      },
+      {
+        "id": 13,
+        "logprob": -1.0488281,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 27332,
+        "logprob": -0.69189453,
+        "special": false,
+        "text": "###"
+      },
+      {
+        "id": 28705,
+        "logprob": -0.013343811,
+        "special": false,
+        "text": " "
+      },
+      {
+        "id": 28750,
+        "logprob": -0.011230469,
+        "special": false,
+        "text": "2"
+      },
+      {
+        "id": 28723,
+        "logprob": -0.00096845627,
+        "special": false,
+        "text": "."
+      },
+      {
+        "id": 21095,
+        "logprob": -2.5605469,
+        "special": false,
+        "text": " Blog"
+      },
+      {
+        "id": 13,
+        "logprob": -0.19458008,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 27332,
+        "logprob": -0.031280518,
+        "special": false,
+        "text": "###"
+      },
+      {
+        "id": 28705,
+        "logprob": -0.0030708313,
+        "special": false,
+        "text": " "
+      },
+      {
+        "id": 28770,
+        "logprob": -0.0029277802,
+        "special": false,
+        "text": "3"
+      },
+      {
+        "id": 28723,
+        "logprob": -0.0012350082,
+        "special": false,
+        "text": "."
+      },
+      {
+        "id": 20108,
+        "logprob": -2.1582031,
+        "special": false,
+        "text": " Article"
+      },
+      {
+        "id": 13,
+        "logprob": -0.05810547,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 27332,
+        "logprob": -0.35083008,
+        "special": false,
+        "text": "###"
+      },
+      {
+        "id": 28705,
+        "logprob": -0.034332275,
+        "special": false,
+        "text": " "
+      },
+      {
+        "id": 28781,
+        "logprob": -0.009666443,
+        "special": false,
+        "text": "4"
+      },
+      {
+        "id": 28723,
+        "logprob": -0.0013113022,
+        "special": false,
+        "text": "."
+      },
+      {
+        "id": 8349,
+        "logprob": -2.6191406,
+        "special": false,
+        "text": " Review"
+      },
+      {
+        "id": 13,
+        "logprob": -0.04031372,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 27332,
+        "logprob": -0.45239258,
+        "special": false,
+        "text": "###"
+      },
+      {
+        "id": 28705,
+        "logprob": -0.045410156,
+        "special": false,
+        "text": " "
+      },
+      {
+        "id": 28782,
+        "logprob": -0.0041236877,
+        "special": false,
+        "text": "5"
+      },
+      {
+        "id": 28723,
+        "logprob": -0.0010223389,
+        "special": false,
+        "text": "."
+      },
+      {
+        "id": 5299,
+        "logprob": -2.8066406,
+        "special": false,
+        "text": " Other"
+      },
+      {
+        "id": 13,
+        "logprob": -0.12054443,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 13,
+        "logprob": -0.44580078,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 13,
+        "logprob": -1.4921875,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 13,
+        "logprob": -1.3574219,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 13,
+        "logprob": -1.0039062,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 13,
+        "logprob": -0.5859375,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 13,
+        "logprob": -0.43481445,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 13,
+        "logprob": -0.2783203,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 13,
+        "logprob": -0.20410156,
+        "special": false,
+        "text": "\n"
+      }
+    ]
+  },
+  "generated_text": "\n\n### 1. News\n### 2. Blog\n### 3. Article\n### 4. Review\n### 5. Other\n\n\n\n\n\n\n\n\n"
+}
--- a/integration-tests/models/__snapshots__/test_lora_mistral/test_lora_mistral_without_customer_support_adapter.json
+++ b/integration-tests/models/__snapshots__/test_lora_mistral/test_lora_mistral_without_customer_support_adapter.json
+{
+  "details": {
+    "finish_reason": "length",
+    "generated_tokens": 40,
+    "prefill": [],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 13,
+        "logprob": -0.31347656,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 13,
+        "logprob": -0.27441406,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 28737,
+        "logprob": -2.2285156,
+        "special": false,
+        "text": "I"
+      },
+      {
+        "id": 28809,
+        "logprob": -1.4677734,
+        "special": false,
+        "text": "’"
+      },
+      {
+        "id": 28719,
+        "logprob": -0.31762695,
+        "special": false,
+        "text": "m"
+      },
+      {
+        "id": 264,
+        "logprob": -1.6865234,
+        "special": false,
+        "text": " a"
+      },
+      {
+        "id": 1215,
+        "logprob": -3.2695312,
+        "special": false,
+        "text": " very"
+      },
+      {
+        "id": 20640,
+        "logprob": -3.1230469,
+        "special": false,
+        "text": " passionate"
+      },
+      {
+        "id": 1338,
+        "logprob": -0.48339844,
+        "special": false,
+        "text": " person"
+      },
+      {
+        "id": 28723,
+        "logprob": -0.9970703,
+        "special": false,
+        "text": "."
+      },
+      {
+        "id": 315,
+        "logprob": -0.5498047,
+        "special": false,
+        "text": " I"
+      },
+      {
+        "id": 28809,
+        "logprob": -1.1923828,
+        "special": false,
+        "text": "’"
+      },
+      {
+        "id": 28719,
+        "logprob": -0.080444336,
+        "special": false,
+        "text": "m"
+      },
+      {
+        "id": 1215,
+        "logprob": -1.8271484,
+        "special": false,
+        "text": " very"
+      },
+      {
+        "id": 12215,
+        "logprob": -2.8847656,
+        "special": false,
+        "text": " driven"
+      },
+      {
+        "id": 28723,
+        "logprob": -1.0927734,
+        "special": false,
+        "text": "."
+      },
+      {
+        "id": 315,
+        "logprob": -0.4584961,
+        "special": false,
+        "text": " I"
+      },
+      {
+        "id": 28809,
+        "logprob": -0.5019531,
+        "special": false,
+        "text": "’"
+      },
+      {
+        "id": 28719,
+        "logprob": -0.030715942,
+        "special": false,
+        "text": "m"
+      },
+      {
+        "id": 1215,
+        "logprob": -0.96972656,
+        "special": false,
+        "text": " very"
+      },
+      {
+        "id": 7798,
+        "logprob": -2.8847656,
+        "special": false,
+        "text": " determined"
+      },
+      {
+        "id": 28723,
+        "logprob": -0.27319336,
+        "special": false,
+        "text": "."
+      },
+      {
+        "id": 13,
+        "logprob": -0.56396484,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 13,
+        "logprob": -0.011016846,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 3195,
+        "logprob": -0.7163086,
+        "special": false,
+        "text": "What"
+      },
+      {
+        "id": 349,
+        "logprob": -1.1611328,
+        "special": false,
+        "text": " is"
+      },
+      {
+        "id": 574,
+        "logprob": -0.515625,
+        "special": false,
+        "text": " your"
+      },
+      {
+        "id": 6656,
+        "logprob": -1.0253906,
+        "special": false,
+        "text": " favorite"
+      },
+      {
+        "id": 1970,
+        "logprob": -2.1738281,
+        "special": false,
+        "text": " thing"
+      },
+      {
+        "id": 684,
+        "logprob": -0.48364258,
+        "special": false,
+        "text": " about"
+      },
+      {
+        "id": 1250,
+        "logprob": -1.8876953,
+        "special": false,
+        "text": " being"
+      },
+      {
+        "id": 264,
+        "logprob": -0.41967773,
+        "special": false,
+        "text": " a"
+      },
+      {
+        "id": 8626,
+        "logprob": -2.9160156,
+        "special": false,
+        "text": " teacher"
+      },
+      {
+        "id": 28804,
+        "logprob": -0.11920166,
+        "special": false,
+        "text": "?"
+      },
+      {
+        "id": 13,
+        "logprob": -0.023727417,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 13,
+        "logprob": -0.010848999,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 28737,
+        "logprob": -1.0566406,
+        "special": false,
+        "text": "I"
+      },
+      {
+        "id": 2016,
+        "logprob": -0.7163086,
+        "special": false,
+        "text": " love"
+      },
+      {
+        "id": 272,
+        "logprob": -1.9169922,
+        "special": false,
+        "text": " the"
+      },
+      {
+        "id": 1639,
+        "logprob": -2.03125,
+        "special": false,
+        "text": " fact"
+      }
+    ]
+  },
+  "generated_text": "\n\nI’m a very passionate person. I’m very driven. I’m very determined.\n\nWhat is your favorite thing about being a teacher?\n\nI love the fact"
+}
--- a/integration-tests/models/__snapshots__/test_mamba/test_mamba.json
+++ b/integration-tests/models/__snapshots__/test_mamba/test_mamba.json
@@ -14,55 +14,55 @@
      },
      {
        "id": 187,
-        "logprob": -0.26953125,
+        "logprob": -0.35742188,
        "special": false,
        "text": "\n"
      },
      {
        "id": 30763,
-        "logprob": -1.1953125,
+        "logprob": -1.1015625,
        "special": false,
        "text": "Deep"
      },
      {
        "id": 4715,
-        "logprob": -0.53515625,
+        "logprob": -0.5234375,
        "special": false,
        "text": " learning"
      },
      {
        "id": 310,
-        "logprob": -0.625,
+        "logprob": -0.55078125,
        "special": false,
        "text": " is"
      },
      {
        "id": 247,
-        "logprob": -0.6796875,
+        "logprob": -0.6640625,
        "special": false,
        "text": " a"
      },
      {
        "id": 747,
-        "logprob": -2.0,
+        "logprob": -2.0625,
        "special": false,
        "text": " new"
      },
      {
        "id": 1511,
-        "logprob": -2.3125,
+        "logprob": -2.375,
        "special": false,
        "text": " type"
      },
      {
        "id": 273,
-        "logprob": -0.0028533936,
+        "logprob": -0.0029144287,
        "special": false,
        "text": " of"
      },
      {
        "id": 5145,
-        "logprob": -1.265625,
+        "logprob": -1.2734375,
        "special": false,
        "text": " machine"
      }

--- a/integration-tests/models/__snapshots__/test_mamba/test_mamba_all_params.json
+++ b/integration-tests/models/__snapshots__/test_mamba/test_mamba_all_params.json
@@ -52,7 +52,7 @@
      },
      {
        "id": 9830,
-        "logprob": -1.65625,
+        "logprob": -2.25,
        "special": false,
        "text": " colors"
      },
@@ -64,13 +64,13 @@
      },
      {
        "id": 329,
-        "logprob": -2.4375,
+        "logprob": -2.171875,
        "special": false,
        "text": " A"
      },
      {
        "id": 1180,
-        "logprob": -1.953125,
+        "logprob": -2.046875,
        "special": false,
        "text": " number"
      },

--- a/integration-tests/models/__snapshots__/test_mllama/test_mllama_load.json
+++ b/integration-tests/models/__snapshots__/test_mllama/test_mllama_load.json
+[
+  {
+    "choices": [
+      {
+        "finish_reason": "length",
+        "index": 0,
+        "logprobs": null,
+        "message": {
+          "content": "In a bustling city, a chicken named Cluck",
+          "name": null,
+          "role": "assistant",
+          "tool_calls": null
+        },
+        "usage": null
+      }
+    ],
+    "created": 1727773835,
+    "id": "",
+    "model": "meta-llama/Llama-3.2-11B-Vision-Instruct",
+    "object": "chat.completion",
+    "system_fingerprint": "2.4.1-dev0-native",
+    "usage": {
+      "completion_tokens": 10,
+      "prompt_tokens": 50,
+      "total_tokens": 60
+    }
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "length",
+        "index": 0,
+        "logprobs": null,
+        "message": {
+          "content": "In a world where even chickens could dream big,",
+          "name": null,
+          "role": "assistant",
+          "tool_calls": null
+        },
+        "usage": null
+      }
+    ],
+    "created": 1727773835,
+    "id": "",
+    "model": "meta-llama/Llama-3.2-11B-Vision-Instruct",
+    "object": "chat.completion",
+    "system_fingerprint": "2.4.1-dev0-native",
+    "usage": {
+      "completion_tokens": 10,
+      "prompt_tokens": 50,
+      "total_tokens": 60
+    }
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "length",
+        "index": 0,
+        "logprobs": null,
+        "message": {
+          "content": "In a world where even chickens could dream big,",
+          "name": null,
+          "role": "assistant",
+          "tool_calls": null
+        },
+        "usage": null
+      }
+    ],
+    "created": 1727773835,
+    "id": "",
+    "model": "meta-llama/Llama-3.2-11B-Vision-Instruct",
+    "object": "chat.completion",
+    "system_fingerprint": "2.4.1-dev0-native",
+    "usage": {
+      "completion_tokens": 10,
+      "prompt_tokens": 50,
+      "total_tokens": 60
+    }
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "length",
+        "index": 0,
+        "logprobs": null,
+        "message": {
+          "content": "In a world where even chickens could dream big,",
+          "name": null,
+          "role": "assistant",
+          "tool_calls": null
+        },
+        "usage": null
+      }
+    ],
+    "created": 1727773835,
+    "id": "",
+    "model": "meta-llama/Llama-3.2-11B-Vision-Instruct",
+    "object": "chat.completion",
+    "system_fingerprint": "2.4.1-dev0-native",
+    "usage": {
+      "completion_tokens": 10,
+      "prompt_tokens": 50,
+      "total_tokens": 60
+    }
+  }
+]
--- a/integration-tests/models/__snapshots__/test_mllama/test_mllama_simpl.json
+++ b/integration-tests/models/__snapshots__/test_mllama/test_mllama_simpl.json
+{
+  "choices": [
+    {
+      "finish_reason": "length",
+      "index": 0,
+      "logprobs": null,
+      "message": {
+        "content": "In a bustling city, a chicken named Cluck",
+        "name": null,
+        "role": "assistant",
+        "tool_calls": null
+      },
+      "usage": null
+    }
+  ],
+  "created": 1727556016,
+  "id": "",
+  "model": "meta-llama/Llama-3.2-11B-Vision-Instruct",
+  "object": "chat.completion",
+  "system_fingerprint": "2.4.1-dev0-native",
+  "usage": {
+    "completion_tokens": 10,
+    "prompt_tokens": 50,
+    "total_tokens": 60
+  }
+}
--- a/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_all_params.json
+++ b/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_all_params.json
@@ -26,19 +26,19 @@
      },
      {
        "id": 259,
-        "logprob": -0.4716797,
+        "logprob": -0.47070312,
        "special": false,
        "text": " "
      },
      {
        "id": 261,
-        "logprob": -0.044677734,
+        "logprob": -0.15307617,
        "special": false,
        "text": ","
      },
      {
        "id": 35622,
-        "logprob": -0.79589844,
+        "logprob": -0.796875,
        "special": false,
        "text": " cloud"
      },
@@ -56,7 +56,7 @@
      },
      {
        "id": 35622,
-        "logprob": -1.1630859,
+        "logprob": -1.2998047,
        "special": false,
        "text": " cloud"
      },
@@ -75,5 +75,5 @@
    ],
    "top_tokens": null
  },
-  "generated_text": "Why is the sky blue?blue sky, clouds and clouds"
+  "generated_text": "Why is the sky blue?blue sky , clouds and clouds"
 }
--- a/integration-tests/models/__snapshots__/test_server_gptq_quantized/test_server_gptq_quantized.json
+++ b/integration-tests/models/__snapshots__/test_server_gptq_quantized/test_server_gptq_quantized.json
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 1,
+        "logprob": null,
+        "text": "<s>"
+      },
+      {
+        "id": 4321,
+        "logprob": -9.8359375,
+        "text": "Test"
+      },
+      {
+        "id": 2009,
+        "logprob": -9.6171875,
+        "text": "request"
+      }
+    ],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 13,
+        "logprob": -2.3417969,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 3057,
+        "logprob": -1.8730469,
+        "special": false,
+        "text": "Test"
+      },
+      {
+        "id": 2009,
+        "logprob": -1.2626953,
+        "special": false,
+        "text": " request"
+      },
+      {
+        "id": 13,
+        "logprob": -1.7060547,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 3057,
+        "logprob": -1.4482422,
+        "special": false,
+        "text": "Test"
+      },
+      {
+        "id": 2009,
+        "logprob": -0.15246582,
+        "special": false,
+        "text": " request"
+      },
+      {
+        "id": 13,
+        "logprob": -0.796875,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 3057,
+        "logprob": -0.22766113,
+        "special": false,
+        "text": "Test"
+      },
+      {
+        "id": 2009,
+        "logprob": -0.007045746,
+        "special": false,
+        "text": " request"
+      },
+      {
+        "id": 13,
+        "logprob": -0.021759033,
+        "special": false,
+        "text": "\n"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "\nTest request\nTest request\nTest request\n"
+}
--- a/integration-tests/models/__snapshots__/test_server_gptq_quantized/test_server_gptq_quantized_all_params.json
+++ b/integration-tests/models/__snapshots__/test_server_gptq_quantized/test_server_gptq_quantized_all_params.json
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 1,
+        "logprob": null,
+        "text": "<s>"
+      },
+      {
+        "id": 4321,
+        "logprob": -9.7890625,
+        "text": "Test"
+      },
+      {
+        "id": 2009,
+        "logprob": -9.625,
+        "text": "request"
+      }
+    ],
+    "seed": 0,
+    "tokens": [
+      {
+        "id": 29899,
+        "logprob": -1.4980469,
+        "special": false,
+        "text": "-"
+      },
+      {
+        "id": 1454,
+        "logprob": -0.19433594,
+        "special": false,
+        "text": "for"
+      },
+      {
+        "id": 29899,
+        "logprob": 0.0,
+        "special": false,
+        "text": "-"
+      },
+      {
+        "id": 9342,
+        "logprob": 0.0,
+        "special": false,
+        "text": "comment"
+      },
+      {
+        "id": 29901,
+        "logprob": 0.0,
+        "special": false,
+        "text": ":"
+      },
+      {
+        "id": 396,
+        "logprob": -0.27392578,
+        "special": false,
+        "text": " #"
+      },
+      {
+        "id": 29906,
+        "logprob": -0.49389648,
+        "special": false,
+        "text": "2"
+      },
+      {
+        "id": 29900,
+        "logprob": -0.81103516,
+        "special": false,
+        "text": "0"
+      },
+      {
+        "id": 29896,
+        "logprob": 0.0,
+        "special": false,
+        "text": "1"
+      },
+      {
+        "id": 29955,
+        "logprob": -1.0800781,
+        "special": false,
+        "text": "7"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "Test request-for-comment: #2017"
+}
--- a/integration-tests/models/__snapshots__/test_server_gptq_quantized/test_server_gptq_quantized_load.json
+++ b/integration-tests/models/__snapshots__/test_server_gptq_quantized/test_server_gptq_quantized_load.json
+[
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 4321,
+          "logprob": -9.8828125,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -9.5859375,
+          "text": "request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -2.3359375,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 3057,
+          "logprob": -1.8623047,
+          "special": false,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -1.2451172,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 13,
+          "logprob": -1.6923828,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 3057,
+          "logprob": -1.4492188,
+          "special": false,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -0.15197754,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 13,
+          "logprob": -0.8022461,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 3057,
+          "logprob": -0.22583008,
+          "special": false,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -0.007095337,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 13,
+          "logprob": -0.021652222,
+          "special": false,
+          "text": "\n"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\nTest request\nTest request\nTest request\n"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 4321,
+          "logprob": -9.796875,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -9.625,
+          "text": "request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -2.3476562,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 3057,
+          "logprob": -1.8789062,
+          "special": false,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -1.2734375,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 13,
+          "logprob": -1.703125,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 3057,
+          "logprob": -1.4677734,
+          "special": false,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -0.15454102,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 13,
+          "logprob": -0.7973633,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 3057,
+          "logprob": -0.23278809,
+          "special": false,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -0.006980896,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 13,
+          "logprob": -0.022033691,
+          "special": false,
+          "text": "\n"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\nTest request\nTest request\nTest request\n"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 4321,
+          "logprob": -9.9296875,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -9.5703125,
+          "text": "request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -2.3203125,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 3057,
+          "logprob": -1.8486328,
+          "special": false,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -1.2480469,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 13,
+          "logprob": -1.7060547,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 3057,
+          "logprob": -1.4511719,
+          "special": false,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -0.1529541,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 13,
+          "logprob": -0.81396484,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 3057,
+          "logprob": -0.22180176,
+          "special": false,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -0.007133484,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 13,
+          "logprob": -0.021835327,
+          "special": false,
+          "text": "\n"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\nTest request\nTest request\nTest request\n"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 4321,
+          "logprob": -9.84375,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -9.6171875,
+          "text": "request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -2.3261719,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 3057,
+          "logprob": -1.8691406,
+          "special": false,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -1.2597656,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 13,
+          "logprob": -1.7070312,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 3057,
+          "logprob": -1.4550781,
+          "special": false,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -0.1538086,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 13,
+          "logprob": -0.79345703,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 3057,
+          "logprob": -0.22924805,
+          "special": false,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -0.0070266724,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 13,
+          "logprob": -0.021942139,
+          "special": false,
+          "text": "\n"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\nTest request\nTest request\nTest request\n"
+  }
+]
--- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information.json
+++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information.json
 {
  "choices": [
    {
-      "finish_reason": "eos_token",
+      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
-        "content": null,
+        "content": "I am an AI assistant",
        "name": null,
        "role": "assistant",
-        "tool_calls": [
-          {
-            "function": {
-              "arguments": {
-                "error": "Cannot get current weather forecast from specified location and temperature unit. Please try again with different options."
-              },
-              "description": null,
-              "name": "notify_error"
-            },
-            "id": 0,
-            "type": "function"
-          }
-        ]
+        "tool_calls": null
      },
      "usage": null
    }
  ],
-  "created": 1712852597,
+  "created": 1728497062,
  "id": "",
-  "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-  "object": "text_completion",
-  "system_fingerprint": "1.4.5-native",
+  "model": "meta-llama/Llama-3.1-8B-Instruct",
+  "object": "chat.completion",
+  "system_fingerprint": "2.4.1-dev0-native",
  "usage": {
-    "completion_tokens": 39,
-    "prompt_tokens": 496,
-    "total_tokens": 535
+    "completion_tokens": 23,
+    "prompt_tokens": 604,
+    "total_tokens": 627
  }
 }
--- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json
+++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json
+{
+  "choices": [
+    {
+      "delta": {
+        "content": " assistant",
+        "role": "assistant",
+        "tool_calls": null
+      },
+      "finish_reason": null,
+      "index": 0,
+      "logprobs": null
+    }
+  ],
+  "created": 1728497531,
+  "id": "",
+  "model": "meta-llama/Llama-3.1-8B-Instruct",
+  "object": "chat.completion.chunk",
+  "system_fingerprint": "2.4.1-dev0-native",
+  "usage": null
+}
--- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream.json
+++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream.json
+{
+  "choices": [
+    {
+      "delta": {
+        "content": " fans",
+        "role": "assistant",
+        "tool_calls": null
+      },
+      "finish_reason": null,
+      "index": 0,
+      "logprobs": null
+    }
+  ],
+  "created": 1728497461,
+  "id": "",
+  "model": "meta-llama/Llama-3.1-8B-Instruct",
+  "object": "chat.completion.chunk",
+  "system_fingerprint": "2.4.1-dev0-native",
+  "usage": null
+}
--- a/integration-tests/models/test_chat_llama.py
+++ b/integration-tests/models/test_chat_llama.py
 import pytest
-import json
-
-from text_generation.types import GrammarType


 @pytest.fixture(scope="module")
@@ -38,6 +35,6 @@ async def test_flash_llama_simple(flash_llama_chat, response_snapshot):
    print(repr(response.choices[0].message.content))
    assert (
        response.choices[0].message.content
-        == "As of your last question, the weather in Brooklyn, New York, is typically hot and humid throughout the year. The suburbs around New York City are jealously sheltered, and at least in the Lower Bronx, there are very few outdoor environments to explore in the middle of urban confines. In fact, typical times for humidity levels in Brooklyn include:\n\n- Early morning: 80-85% humidity, with occas"
+        == "As of your last question, the weather in Brooklyn, New York, is typically hot and humid throughout the year. The suburbs around New York City are jealously sheltered, and at least in the Lower Bronx, there are very few outdoor environments to appreciate nature.\n\nIn terms of temperature, the warmest times of the year are from June to August, when average high temperatures typically range from around 73°F or 23°C"
    )
    assert response == response_snapshot
--- a/integration-tests/models/test_completion_prompts.py
+++ b/integration-tests/models/test_completion_prompts.py
@@ -3,15 +3,13 @@ import requests
 import json
 from aiohttp import ClientSession

-from text_generation.types import (
-    Completion,
-)
+from text_generation.types import Completion, ChatCompletionChunk


 @pytest.fixture(scope="module")
 def flash_llama_completion_handle(launcher):
    with launcher(
-        "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+        "meta-llama/Meta-Llama-3.1-8B-Instruct",
    ) as handle:
        yield handle

@@ -34,28 +32,145 @@ def test_flash_llama_completion_single_prompt(
        f"{flash_llama_completion.base_url}/v1/completions",
        json={
            "model": "tgi",
-            "prompt": "Say this is a test",
-            "max_tokens": 5,
-            "seed": 0,
+            "prompt": "What is Deep Learning?",
+            "max_tokens": 10,
+            "temperature": 0.0,
        },
        headers=flash_llama_completion.headers,
        stream=False,
    )
    response = response.json()
    assert len(response["choices"]) == 1
-
+    assert (
+        response["choices"][0]["text"]
+        == " A Beginner’s Guide\nDeep learning is a subset"
+    )
    assert response == response_snapshot


+@pytest.mark.release
+async def test_flash_llama_completion_stream_usage(
+    flash_llama_completion, response_snapshot
+):
+    url = f"{flash_llama_completion.base_url}/v1/chat/completions"
+    request = {
+        "model": "tgi",
+        "messages": [
+            {
+                "role": "user",
+                "content": "What is Deep Learning?",
+            }
+        ],
+        "max_tokens": 10,
+        "temperature": 0.0,
+        "stream_options": {"include_usage": True},
+        "stream": True,
+    }
+    string = ""
+    chunks = []
+    had_usage = False
+    async with ClientSession(headers=flash_llama_completion.headers) as session:
+        async with session.post(url, json=request) as response:
+            # iterate over the stream
+            async for chunk in response.content.iter_any():
+                # remove "data:"
+                chunk = chunk.decode().split("\n\n")
+                # remove "data:" if present
+                chunk = [c.replace("data:", "") for c in chunk]
+                # remove empty strings
+                chunk = [c for c in chunk if c]
+                # remove completion marking chunk
+                chunk = [c for c in chunk if c != " [DONE]"]
+                # parse json
+                chunk = [json.loads(c) for c in chunk]
+
+                for c in chunk:
+                    chunks.append(ChatCompletionChunk(**c))
+                    assert "choices" in c
+                    if len(c["choices"]) == 1:
+                        index = c["choices"][0]["index"]
+                        assert index == 0
+                        string += c["choices"][0]["delta"]["content"]
+
+                        has_usage = c["usage"] is not None
+                        assert not had_usage
+                        if has_usage:
+                            had_usage = True
+                    else:
+                        raise RuntimeError("Expected different payload")
+    assert had_usage
+    assert (
+        string
+        == "**Deep Learning: An Overview**\n=====================================\n\n"
+    )
+    assert chunks == response_snapshot
+
+    request = {
+        "model": "tgi",
+        "messages": [
+            {
+                "role": "user",
+                "content": "What is Deep Learning?",
+            }
+        ],
+        "max_tokens": 10,
+        "temperature": 0.0,
+        "stream": True,
+    }
+    string = ""
+    chunks = []
+    had_usage = False
+    async with ClientSession(headers=flash_llama_completion.headers) as session:
+        async with session.post(url, json=request) as response:
+            # iterate over the stream
+            async for chunk in response.content.iter_any():
+                # remove "data:"
+                chunk = chunk.decode().split("\n\n")
+                # remove "data:" if present
+                chunk = [c.replace("data:", "") for c in chunk]
+                # remove empty strings
+                chunk = [c for c in chunk if c]
+                # remove completion marking chunk
+                chunk = [c for c in chunk if c != " [DONE]"]
+                # parse json
+                chunk = [json.loads(c) for c in chunk]
+
+                for c in chunk:
+                    chunks.append(ChatCompletionChunk(**c))
+                    assert "choices" in c
+                    if len(c["choices"]) == 1:
+                        index = c["choices"][0]["index"]
+                        assert index == 0
+                        string += c["choices"][0]["delta"]["content"]
+
+                        has_usage = c["usage"] is not None
+                        assert not had_usage
+                        if has_usage:
+                            had_usage = True
+                    else:
+                        raise RuntimeError("Expected different payload")
+    assert not had_usage
+    assert (
+        string
+        == "**Deep Learning: An Overview**\n=====================================\n\n"
+    )
+
+
 @pytest.mark.release
 def test_flash_llama_completion_many_prompts(flash_llama_completion, response_snapshot):
    response = requests.post(
        f"{flash_llama_completion.base_url}/v1/completions",
        json={
            "model": "tgi",
-            "prompt": ["Say", "this", "is", "a"],
+            "prompt": [
+                "What is Deep Learning?",
+                "Is water wet?",
+                "What is the capital of France?",
+                "def mai",
+            ],
            "max_tokens": 10,
            "seed": 0,
+            "temperature": 0.0,
        },
        headers=flash_llama_completion.headers,
        stream=False,
@@ -63,9 +178,16 @@ def test_flash_llama_completion_many_prompts(flash_llama_completion, response_sn
    response = response.json()
    assert len(response["choices"]) == 4

-    all_indexes = [choice["index"] for choice in response["choices"]]
+    all_indexes = [(choice["index"], choice["text"]) for choice in response["choices"]]
    all_indexes.sort()
-    assert all_indexes == [0, 1, 2, 3]
+    all_indices, all_strings = zip(*all_indexes)
+    assert list(all_indices) == [0, 1, 2, 3]
+    assert list(all_strings) == [
+        " A Beginner’s Guide\nDeep learning is a subset",
+        " This is a question that has puzzled many people for",
+        " Paris\nWhat is the capital of France?\nThe",
+        'usculas_minusculas(s):\n    """\n',
+    ]

    assert response == response_snapshot

@@ -77,19 +199,21 @@ async def test_flash_llama_completion_many_prompts_stream(
    request = {
        "model": "tgi",
        "prompt": [
-            "What color is the sky?",
+            "What is Deep Learning?",
            "Is water wet?",
            "What is the capital of France?",
            "def mai",
        ],
        "max_tokens": 10,
        "seed": 0,
+        "temperature": 0.0,
        "stream": True,
    }

    url = f"{flash_llama_completion.base_url}/v1/completions"

    chunks = []
+    strings = [""] * 4
    async with ClientSession(headers=flash_llama_completion.headers) as session:
        async with session.post(url, json=request) as response:
            # iterate over the stream
@@ -100,13 +224,23 @@ async def test_flash_llama_completion_many_prompts_stream(
                chunk = [c.replace("data:", "") for c in chunk]
                # remove empty strings
                chunk = [c for c in chunk if c]
+                # remove completion marking chunk
+                chunk = [c for c in chunk if c != " [DONE]"]
                # parse json
                chunk = [json.loads(c) for c in chunk]

                for c in chunk:
                    chunks.append(Completion(**c))
                    assert "choices" in c
-                    assert 0 <= c["choices"][0]["index"] <= 4
+                    index = c["choices"][0]["index"]
+                    assert 0 <= index <= 4
+                    strings[index] += c["choices"][0]["text"]

    assert response.status == 200
+    assert list(strings) == [
+        " A Beginner’s Guide\nDeep learning is a subset",
+        " This is a question that has puzzled many people for",
+        " Paris\nWhat is the capital of France?\nThe",
+        'usculas_minusculas(s):\n    """\n',
+    ]
    assert chunks == response_snapshot
--- a/integration-tests/models/test_flash_deepseek_v2.py
+++ b/integration-tests/models/test_flash_deepseek_v2.py
+import pytest
+
+
+@pytest.fixture(scope="module")
+def flash_deepseek_v2_handle(launcher):
+    with launcher("deepseek-ai/DeepSeek-V2-Lite", num_shard=2) as handle:
+        yield handle
+
+
+@pytest.fixture(scope="module")
+async def flash_deepseek_v2(flash_deepseek_v2_handle):
+    await flash_deepseek_v2_handle.health(300)
+    return flash_deepseek_v2_handle.client
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_deepseek_v2(flash_deepseek_v2, response_snapshot):
+    response = await flash_deepseek_v2.generate(
+        "Test request", max_new_tokens=10, decoder_input_details=True
+    )
+
+    assert response == response_snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_deepseek_v2_all_params(flash_deepseek_v2, response_snapshot):
+    response = await flash_deepseek_v2.generate(
+        "Test request",
+        max_new_tokens=10,
+        repetition_penalty=1.2,
+        return_full_text=True,
+        stop_sequences=["test"],
+        temperature=0.5,
+        top_p=0.9,
+        top_k=10,
+        truncate=5,
+        typical_p=0.9,
+        watermark=True,
+        decoder_input_details=True,
+        seed=0,
+    )
+
+    assert response == response_snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_deepseek_v2_load(
+    flash_deepseek_v2, generate_load, response_snapshot
+):
+    responses = await generate_load(
+        flash_deepseek_v2, "Test request", max_new_tokens=10, n=4
+    )
+
+    assert len(responses) == 4
+    assert all([r.generated_text == responses[0].generated_text for r in responses])
+
+    assert responses == response_snapshot
--- a/integration-tests/models/test_flash_gemma.py
+++ b/integration-tests/models/test_flash_gemma.py
@@ -16,7 +16,7 @@ async def flash_gemma(flash_gemma_handle):
 @pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
-async def test_flash_gemma(flash_gemma, response_snapshot):
+async def test_flash_gemma_simple(flash_gemma, response_snapshot):
    response = await flash_gemma.generate(
        "Test request", max_new_tokens=10, decoder_input_details=True
    )