Auto max prefill (#2797)

* Attempt at automatic max batch prefill. * Taking into account number of shards. * Adding more cards. * Adding A100 + H100. * Adding a few more cards. * Logprobs cost too much. * h100 better name, and keep factor of 2 * Damn inflated sparse tflops. * Typo in h100. * Updated the flops calculation (checked with fvcore). * chunking by default. * Fix prefix caching for chat completion since we removed logprobs. * More tests. * Dropping all the prefill logprobs. * Add a flag that enables users to get logprobs back. * Repairing prompt token counting. * Fixing a few tests. * Remove some scaffolding. * Attempting to reduce the issues (workarounds for now).

Auto max prefill (#2797)
* Attempt at automatic max batch prefill. * Taking into account number of shards. * Adding more cards. * Adding A100 + H100. * Adding a few more cards. * Logprobs cost too much. * h100 better name, and keep factor of 2 * Damn inflated sparse tflops. * Typo in h100. * Updated the flops calculation (checked with fvcore). * chunking by default. * Fix prefix caching for chat completion since we removed logprobs. * More tests. * Dropping all the prefill logprobs. * Add a flag that enables users to get logprobs back. * Repairing prompt token counting. * Fixing a few tests. * Remove some scaffolding. * Attempting to reduce the issues (workarounds for now).
5df80590 · Nicolas Patry · GitHub · 8c3669b2 · 5df80590 · 5df80590
Unverified Commit 5df80590 authored Dec 06, 2024 by Nicolas Patry Committed by GitHub Dec 06, 2024
20 changed files
--- a/docs/source/reference/launcher.md
+++ b/docs/source/reference/launcher.md
@@ -467,6 +467,16 @@ Options:
          [env: PAYLOAD_LIMIT=]
          [default: 2000000]
+```
+## ENABLE_PREFILL_LOGPROBS
+```shell
+      --enable-prefill-logprobs
+          Enables prefill logprobs
+          Logprobs in the prompt are deactivated by default because they consume a large amount of VRAM (especially for long prompts). Using this flag allows users to request them again.
+          [env: ENABLE_PREFILL_LOGPROBS=]
 ```
 ## HELP
 ```shell

--- a/integration-tests/models/__snapshots__/test.py
+++ b/integration-tests/models/__snapshots__/test.py
+"""Empty the ``details.prefill`` list of every JSON snapshot below the current directory.
+
+Walks ``.`` recursively, loads each ``*.json`` file, replaces the
+``["details"]["prefill"]`` value with ``[]`` where that layout exists, and
+writes the file back with 2-space indentation. Files whose content does not
+have that layout are still re-serialized, just without the prefill change.
+"""
+import os
+import json
+
+for root, dirs, files in os.walk("."):
+    for filename in files:
+        if not filename.endswith(".json"):
+            continue
+        path = os.path.join(root, filename)
+        # The dump below uses ensure_ascii=False, so non-ASCII characters are
+        # written verbatim; pin UTF-8 on both read and write so the result
+        # does not depend on the platform's default encoding.
+        with open(path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+        print(path)
+        try:
+            if filename.endswith("_load.json"):
+                # "_load" snapshots are treated as a sequence of responses,
+                # each carrying its own "details" object.
+                for response in data:
+                    response["details"]["prefill"] = []
+            else:
+                data["details"]["prefill"] = []
+        except (KeyError, IndexError, TypeError):
+            # Best effort: a snapshot without the expected
+            # details/prefill layout is left as loaded.
+            pass
+        with open(path, "w", encoding="utf-8") as f:
+            json.dump(data, f, indent=2, ensure_ascii=False)
--- a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int/test_compressed_tensors_w8a8_int.json
+++ b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int/test_compressed_tensors_w8a8_int.json
@@ -3,38 +3,7 @@
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
-    "prefill": [
+    "prefill": [],
-      {
-        "id": 128000,
-        "logprob": null,
-        "text": "<|begin_of_text|>"
-      },
-      {
-        "id": 3923,
-        "logprob": -6.3867188,
-        "text": "What"
-      },
-      {
-        "id": 374,
-        "logprob": -1.1318359,
-        "text": " is"
-      },
-      {
-        "id": 5655,
-        "logprob": -9.6875,
-        "text": " deep"
-      },
-      {
-        "id": 6975,
-        "logprob": -1.3007812,
-        "text": " learning"
-      },
-      {
-        "id": 30,
-        "logprob": -2.4902344,
-        "text": "?"
-      }
-    ],
    "seed": null,
    "tokens": [
      {

--- a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int/test_compressed_tensors_w8a8_int_all_params.json
+++ b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int/test_compressed_tensors_w8a8_int_all_params.json
@@ -3,33 +3,7 @@
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
-    "prefill": [
+    "prefill": [],
-      {
-        "id": 128000,
-        "logprob": null,
-        "text": "<|begin_of_text|>"
-      },
-      {
-        "id": 3923,
-        "logprob": -6.3867188,
-        "text": "What"
-      },
-      {
-        "id": 374,
-        "logprob": -1.1318359,
-        "text": " is"
-      },
-      {
-        "id": 5655,
-        "logprob": -9.6875,
-        "text": " deep"
-      },
-      {
-        "id": 6975,
-        "logprob": -1.3007812,
-        "text": " learning"
-      }
-    ],
    "seed": 0,
    "tokens": [
      {

--- a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int/test_compressed_tensors_w8a8_int_load.json
+++ b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int/test_compressed_tensors_w8a8_int_load.json
@@ -4,38 +4,7 @@
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
-      "prefill": [
+      "prefill": [],
-        {
-          "id": 128000,
-          "logprob": null,
-          "text": "<|begin_of_text|>"
-        },
-        {
-          "id": 3923,
-          "logprob": -6.3867188,
-          "text": "What"
-        },
-        {
-          "id": 374,
-          "logprob": -1.1318359,
-          "text": " is"
-        },
-        {
-          "id": 5655,
-          "logprob": -9.6875,
-          "text": " deep"
-        },
-        {
-          "id": 6975,
-          "logprob": -1.3007812,
-          "text": " learning"
-        },
-        {
-          "id": 30,
-          "logprob": -2.4902344,
-          "text": "?"
-        }
-      ],
      "seed": null,
      "tokens": [
        {
@@ -108,38 +77,7 @@
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
-      "prefill": [
+      "prefill": [],
-        {
-          "id": 128000,
-          "logprob": null,
-          "text": "<|begin_of_text|>"
-        },
-        {
-          "id": 3923,
-          "logprob": -6.3867188,
-          "text": "What"
-        },
-        {
-          "id": 374,
-          "logprob": -1.1318359,
-          "text": " is"
-        },
-        {
-          "id": 5655,
-          "logprob": -9.6875,
-          "text": " deep"
-        },
-        {
-          "id": 6975,
-          "logprob": -1.3007812,
-          "text": " learning"
-        },
-        {
-          "id": 30,
-          "logprob": -2.4902344,
-          "text": "?"
-        }
-      ],
      "seed": null,
      "tokens": [
        {
@@ -212,38 +150,7 @@
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
-      "prefill": [
+      "prefill": [],
-        {
-          "id": 128000,
-          "logprob": null,
-          "text": "<|begin_of_text|>"
-        },
-        {
-          "id": 3923,
-          "logprob": -6.3867188,
-          "text": "What"
-        },
-        {
-          "id": 374,
-          "logprob": -1.1318359,
-          "text": " is"
-        },
-        {
-          "id": 5655,
-          "logprob": -9.6875,
-          "text": " deep"
-        },
-        {
-          "id": 6975,
-          "logprob": -1.3007812,
-          "text": " learning"
-        },
-        {
-          "id": 30,
-          "logprob": -2.4902344,
-          "text": "?"
-        }
-      ],
      "seed": null,
      "tokens": [
        {
@@ -316,38 +223,7 @@
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
-      "prefill": [
+      "prefill": [],
-        {
-          "id": 128000,
-          "logprob": null,
-          "text": "<|begin_of_text|>"
-        },
-        {
-          "id": 3923,
-          "logprob": -6.3867188,
-          "text": "What"
-        },
-        {
-          "id": 374,
-          "logprob": -1.1318359,
-          "text": " is"
-        },
-        {
-          "id": 5655,
-          "logprob": -9.6875,
-          "text": " deep"
-        },
-        {
-          "id": 6975,
-          "logprob": -1.3007812,
-          "text": " learning"
-        },
-        {
-          "id": 30,
-          "logprob": -2.4902344,
-          "text": "?"
-        }
-      ],
      "seed": null,
      "tokens": [
        {

--- a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight.json
+++ b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight.json
@@ -3,33 +3,7 @@
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
-    "prefill": [
+    "prefill": [],
-      {
-        "id": 3838,
-        "logprob": null,
-        "text": "What"
-      },
-      {
-        "id": 374,
-        "logprob": -8.59375,
-        "text": " is"
-      },
-      {
-        "id": 5538,
-        "logprob": -10.921875,
-        "text": " deep"
-      },
-      {
-        "id": 6832,
-        "logprob": -0.56347656,
-        "text": " learning"
-      },
-      {
-        "id": 30,
-        "logprob": -1.5,
-        "text": "?"
-      }
-    ],
    "seed": null,
    "tokens": [
      {

--- a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json
+++ b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json
@@ -3,28 +3,7 @@
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
-    "prefill": [
+    "prefill": [],
-      {
-        "id": 3838,
-        "logprob": null,
-        "text": "What"
-      },
-      {
-        "id": 374,
-        "logprob": -8.59375,
-        "text": " is"
-      },
-      {
-        "id": 5538,
-        "logprob": -10.921875,
-        "text": " deep"
-      },
-      {
-        "id": 6832,
-        "logprob": -0.56347656,
-        "text": " learning"
-      }
-    ],
    "seed": 0,
    "tokens": [
      {

--- a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_load.json
+++ b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_load.json
@@ -4,33 +4,7 @@
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
-      "prefill": [
+      "prefill": [],
-        {
-          "id": 3838,
-          "logprob": null,
-          "text": "What"
-        },
-        {
-          "id": 374,
-          "logprob": -8.59375,
-          "text": " is"
-        },
-        {
-          "id": 5538,
-          "logprob": -10.921875,
-          "text": " deep"
-        },
-        {
-          "id": 6832,
-          "logprob": -0.56347656,
-          "text": " learning"
-        },
-        {
-          "id": 30,
-          "logprob": -1.5,
-          "text": "?"
-        }
-      ],
      "seed": null,
      "tokens": [
        {
@@ -103,33 +77,7 @@
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
-      "prefill": [
+      "prefill": [],
-        {
-          "id": 3838,
-          "logprob": null,
-          "text": "What"
-        },
-        {
-          "id": 374,
-          "logprob": -8.59375,
-          "text": " is"
-        },
-        {
-          "id": 5538,
-          "logprob": -10.921875,
-          "text": " deep"
-        },
-        {
-          "id": 6832,
-          "logprob": -0.56347656,
-          "text": " learning"
-        },
-        {
-          "id": 30,
-          "logprob": -1.5,
-          "text": "?"
-        }
-      ],
      "seed": null,
      "tokens": [
        {
@@ -202,33 +150,7 @@
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
-      "prefill": [
+      "prefill": [],
-        {
-          "id": 3838,
-          "logprob": null,
-          "text": "What"
-        },
-        {
-          "id": 374,
-          "logprob": -8.59375,
-          "text": " is"
-        },
-        {
-          "id": 5538,
-          "logprob": -10.921875,
-          "text": " deep"
-        },
-        {
-          "id": 6832,
-          "logprob": -0.56347656,
-          "text": " learning"
-        },
-        {
-          "id": 30,
-          "logprob": -1.5,
-          "text": "?"
-        }
-      ],
      "seed": null,
      "tokens": [
        {
@@ -301,33 +223,7 @@
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
-      "prefill": [
+      "prefill": [],
-        {
-          "id": 3838,
-          "logprob": null,
-          "text": "What"
-        },
-        {
-          "id": 374,
-          "logprob": -8.59375,
-          "text": " is"
-        },
-        {
-          "id": 5538,
-          "logprob": -10.921875,
-          "text": " deep"
-        },
-        {
-          "id": 6832,
-          "logprob": -0.56347656,
-          "text": " learning"
-        },
-        {
-          "id": 30,
-          "logprob": -1.5,
-          "text": "?"
-        }
-      ],
      "seed": null,
      "tokens": [
        {

--- a/integration-tests/models/__snapshots__/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an.json
+++ b/integration-tests/models/__snapshots__/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an.json
@@ -3,38 +3,7 @@
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
-    "prefill": [
+    "prefill": [],
-      {
-        "id": 128000,
-        "logprob": null,
-        "text": "<|begin_of_text|>"
-      },
-      {
-        "id": 3923,
-        "logprob": -7.609375,
-        "text": "What"
-      },
-      {
-        "id": 374,
-        "logprob": -0.92529297,
-        "text": " is"
-      },
-      {
-        "id": 5655,
-        "logprob": -10.0,
-        "text": " deep"
-      },
-      {
-        "id": 6975,
-        "logprob": -0.94628906,
-        "text": " learning"
-      },
-      {
-        "id": 30,
-        "logprob": -2.9042969,
-        "text": "?"
-      }
-    ],
    "seed": null,
    "tokens": [
      {

--- a/integration-tests/models/__snapshots__/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an_all_params.json
+++ b/integration-tests/models/__snapshots__/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an_all_params.json
@@ -3,33 +3,7 @@
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
-    "prefill": [
+    "prefill": [],
-      {
-        "id": 128000,
-        "logprob": null,
-        "text": "<|begin_of_text|>"
-      },
-      {
-        "id": 3923,
-        "logprob": -7.609375,
-        "text": "What"
-      },
-      {
-        "id": 374,
-        "logprob": -0.92529297,
-        "text": " is"
-      },
-      {
-        "id": 5655,
-        "logprob": -10.0,
-        "text": " deep"
-      },
-      {
-        "id": 6975,
-        "logprob": -0.94628906,
-        "text": " learning"
-      }
-    ],
    "seed": 0,
    "tokens": [
      {

--- a/integration-tests/models/__snapshots__/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an_load.json
+++ b/integration-tests/models/__snapshots__/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an_load.json
@@ -4,38 +4,7 @@
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
-      "prefill": [
+      "prefill": [],
-        {
-          "id": 128000,
-          "logprob": null,
-          "text": "<|begin_of_text|>"
-        },
-        {
-          "id": 3923,
-          "logprob": -7.609375,
-          "text": "What"
-        },
-        {
-          "id": 374,
-          "logprob": -0.92529297,
-          "text": " is"
-        },
-        {
-          "id": 5655,
-          "logprob": -10.0,
-          "text": " deep"
-        },
-        {
-          "id": 6975,
-          "logprob": -0.94628906,
-          "text": " learning"
-        },
-        {
-          "id": 30,
-          "logprob": -2.9042969,
-          "text": "?"
-        }
-      ],
      "seed": null,
      "tokens": [
        {
@@ -108,38 +77,7 @@
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
-      "prefill": [
+      "prefill": [],
-        {
-          "id": 128000,
-          "logprob": null,
-          "text": "<|begin_of_text|>"
-        },
-        {
-          "id": 3923,
-          "logprob": -7.6054688,
-          "text": "What"
-        },
-        {
-          "id": 374,
-          "logprob": -0.92089844,
-          "text": " is"
-        },
-        {
-          "id": 5655,
-          "logprob": -10.0,
-          "text": " deep"
-        },
-        {
-          "id": 6975,
-          "logprob": -0.94433594,
-          "text": " learning"
-        },
-        {
-          "id": 30,
-          "logprob": -2.90625,
-          "text": "?"
-        }
-      ],
      "seed": null,
      "tokens": [
        {
@@ -212,38 +150,7 @@
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
-      "prefill": [
+      "prefill": [],
-        {
-          "id": 128000,
-          "logprob": null,
-          "text": "<|begin_of_text|>"
-        },
-        {
-          "id": 3923,
-          "logprob": -7.6054688,
-          "text": "What"
-        },
-        {
-          "id": 374,
-          "logprob": -0.92089844,
-          "text": " is"
-        },
-        {
-          "id": 5655,
-          "logprob": -10.0,
-          "text": " deep"
-        },
-        {
-          "id": 6975,
-          "logprob": -0.94433594,
-          "text": " learning"
-        },
-        {
-          "id": 30,
-          "logprob": -2.90625,
-          "text": "?"
-        }
-      ],
      "seed": null,
      "tokens": [
        {
@@ -316,38 +223,7 @@
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
-      "prefill": [
+      "prefill": [],
-        {
-          "id": 128000,
-          "logprob": null,
-          "text": "<|begin_of_text|>"
-        },
-        {
-          "id": 3923,
-          "logprob": -7.6054688,
-          "text": "What"
-        },
-        {
-          "id": 374,
-          "logprob": -0.92089844,
-          "text": " is"
-        },
-        {
-          "id": 5655,
-          "logprob": -10.0,
-          "text": " deep"
-        },
-        {
-          "id": 6975,
-          "logprob": -0.94433594,
-          "text": " learning"
-        },
-        {
-          "id": 30,
-          "logprob": -2.90625,
-          "text": "?"
-        }
-      ],
      "seed": null,
      "tokens": [
        {

--- a/integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16.json
+++ b/integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16.json
@@ -3,38 +3,7 @@
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
-    "prefill": [
+    "prefill": [],
-      {
-        "id": 2,
-        "logprob": null,
-        "text": "<bos>"
-      },
-      {
-        "id": 1841,
-        "logprob": -5.46875,
-        "text": "What"
-      },
-      {
-        "id": 603,
-        "logprob": -0.69140625,
-        "text": " is"
-      },
-      {
-        "id": 5271,
-        "logprob": -12.0,
-        "text": " deep"
-      },
-      {
-        "id": 6044,
-        "logprob": -0.32226562,
-        "text": " learning"
-      },
-      {
-        "id": 235336,
-        "logprob": -0.33203125,
-        "text": "?"
-      }
-    ],
    "seed": null,
    "tokens": [
      {

--- a/integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16_all_params.json
+++ b/integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16_all_params.json
@@ -3,33 +3,7 @@
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
-    "prefill": [
+    "prefill": [],
-      {
-        "id": 2,
-        "logprob": null,
-        "text": "<bos>"
-      },
-      {
-        "id": 1841,
-        "logprob": -5.46875,
-        "text": "What"
-      },
-      {
-        "id": 603,
-        "logprob": -0.69140625,
-        "text": " is"
-      },
-      {
-        "id": 5271,
-        "logprob": -12.0,
-        "text": " deep"
-      },
-      {
-        "id": 6044,
-        "logprob": -0.32226562,
-        "text": " learning"
-      }
-    ],
    "seed": 0,
    "tokens": [
      {

--- a/integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16_load.json
+++ b/integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16_load.json
@@ -4,38 +4,7 @@
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
-      "prefill": [
+      "prefill": [],
-        {
-          "id": 2,
-          "logprob": null,
-          "text": "<bos>"
-        },
-        {
-          "id": 1841,
-          "logprob": -5.46875,
-          "text": "What"
-        },
-        {
-          "id": 603,
-          "logprob": -0.69140625,
-          "text": " is"
-        },
-        {
-          "id": 5271,
-          "logprob": -12.0,
-          "text": " deep"
-        },
-        {
-          "id": 6044,
-          "logprob": -0.32226562,
-          "text": " learning"
-        },
-        {
-          "id": 235336,
-          "logprob": -0.33203125,
-          "text": "?"
-        }
-      ],
      "seed": null,
      "tokens": [
        {
@@ -108,38 +77,7 @@
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
-      "prefill": [
+      "prefill": [],
-        {
-          "id": 2,
-          "logprob": null,
-          "text": "<bos>"
-        },
-        {
-          "id": 1841,
-          "logprob": -5.46875,
-          "text": "What"
-        },
-        {
-          "id": 603,
-          "logprob": -0.71484375,
-          "text": " is"
-        },
-        {
-          "id": 5271,
-          "logprob": -12.0,
-          "text": " deep"
-        },
-        {
-          "id": 6044,
-          "logprob": -0.30859375,
-          "text": " learning"
-        },
-        {
-          "id": 235336,
-          "logprob": -0.3359375,
-          "text": "?"
-        }
-      ],
      "seed": null,
      "tokens": [
        {
@@ -212,38 +150,7 @@
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
-      "prefill": [
+      "prefill": [],
-        {
-          "id": 2,
-          "logprob": null,
-          "text": "<bos>"
-        },
-        {
-          "id": 1841,
-          "logprob": -5.46875,
-          "text": "What"
-        },
-        {
-          "id": 603,
-          "logprob": -0.71484375,
-          "text": " is"
-        },
-        {
-          "id": 5271,
-          "logprob": -12.0,
-          "text": " deep"
-        },
-        {
-          "id": 6044,
-          "logprob": -0.30859375,
-          "text": " learning"
-        },
-        {
-          "id": 235336,
-          "logprob": -0.3359375,
-          "text": "?"
-        }
-      ],
      "seed": null,
      "tokens": [
        {
@@ -316,38 +223,7 @@
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
-      "prefill": [
+      "prefill": [],
-        {
-          "id": 2,
-          "logprob": null,
-          "text": "<bos>"
-        },
-        {
-          "id": 1841,
-          "logprob": -5.46875,
-          "text": "What"
-        },
-        {
-          "id": 603,
-          "logprob": -0.71484375,
-          "text": " is"
-        },
-        {
-          "id": 5271,
-          "logprob": -12.0,
-          "text": " deep"
-        },
-        {
-          "id": 6044,
-          "logprob": -0.30859375,
-          "text": " learning"
-        },
-        {
-          "id": 235336,
-          "logprob": -0.3359375,
-          "text": "?"
-        }
-      ],
      "seed": null,
      "tokens": [
        {

--- a/integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int_24/test_compressed_tensors_wna16_int_24.json
+++ b/integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int_24/test_compressed_tensors_wna16_int_24.json
@@ -3,38 +3,7 @@
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
-    "prefill": [
+    "prefill": [],
-      {
-        "id": 128000,
-        "logprob": null,
-        "text": "<|begin_of_text|>"
-      },
-      {
-        "id": 3923,
-        "logprob": -7.5390625,
-        "text": "What"
-      },
-      {
-        "id": 374,
-        "logprob": -0.86035156,
-        "text": " is"
-      },
-      {
-        "id": 5655,
-        "logprob": -8.828125,
-        "text": " deep"
-      },
-      {
-        "id": 6975,
-        "logprob": -1.4912109,
-        "text": " learning"
-      },
-      {
-        "id": 30,
-        "logprob": -2.1152344,
-        "text": "?"
-      }
-    ],
    "seed": null,
    "tokens": [
      {

--- a/integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int_24/test_compressed_tensors_wna16_int_24_all_params.json
+++ b/integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int_24/test_compressed_tensors_wna16_int_24_all_params.json
@@ -3,33 +3,7 @@
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
-    "prefill": [
+    "prefill": [],
-      {
-        "id": 128000,
-        "logprob": null,
-        "text": "<|begin_of_text|>"
-      },
-      {
-        "id": 3923,
-        "logprob": -7.5390625,
-        "text": "What"
-      },
-      {
-        "id": 374,
-        "logprob": -0.86035156,
-        "text": " is"
-      },
-      {
-        "id": 5655,
-        "logprob": -8.828125,
-        "text": " deep"
-      },
-      {
-        "id": 6975,
-        "logprob": -1.4912109,
-        "text": " learning"
-      }
-    ],
    "seed": 0,
    "tokens": [
      {

--- a/integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int_24/test_compressed_tensors_wna16_int_24_load.json
+++ b/integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int_24/test_compressed_tensors_wna16_int_24_load.json
@@ -4,38 +4,7 @@
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
-      "prefill": [
+      "prefill": [],
-        {
-          "id": 128000,
-          "logprob": null,
-          "text": "<|begin_of_text|>"
-        },
-        {
-          "id": 3923,
-          "logprob": -7.5390625,
-          "text": "What"
-        },
-        {
-          "id": 374,
-          "logprob": -0.86035156,
-          "text": " is"
-        },
-        {
-          "id": 5655,
-          "logprob": -8.828125,
-          "text": " deep"
-        },
-        {
-          "id": 6975,
-          "logprob": -1.4912109,
-          "text": " learning"
-        },
-        {
-          "id": 30,
-          "logprob": -2.1152344,
-          "text": "?"
-        }
-      ],
      "seed": null,
      "tokens": [
        {
@@ -108,38 +77,7 @@
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
-      "prefill": [
+      "prefill": [],
-        {
-          "id": 128000,
-          "logprob": null,
-          "text": "<|begin_of_text|>"
-        },
-        {
-          "id": 3923,
-          "logprob": -7.5351562,
-          "text": "What"
-        },
-        {
-          "id": 374,
-          "logprob": -0.85791016,
-          "text": " is"
-        },
-        {
-          "id": 5655,
-          "logprob": -8.828125,
-          "text": " deep"
-        },
-        {
-          "id": 6975,
-          "logprob": -1.4882812,
-          "text": " learning"
-        },
-        {
-          "id": 30,
-          "logprob": -2.1210938,
-          "text": "?"
-        }
-      ],
      "seed": null,
      "tokens": [
        {
@@ -212,38 +150,7 @@
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
-      "prefill": [
+      "prefill": [],
-        {
-          "id": 128000,
-          "logprob": null,
-          "text": "<|begin_of_text|>"
-        },
-        {
-          "id": 3923,
-          "logprob": -7.5351562,
-          "text": "What"
-        },
-        {
-          "id": 374,
-          "logprob": -0.85791016,
-          "text": " is"
-        },
-        {
-          "id": 5655,
-          "logprob": -8.828125,
-          "text": " deep"
-        },
-        {
-          "id": 6975,
-          "logprob": -1.4882812,
-          "text": " learning"
-        },
-        {
-          "id": 30,
-          "logprob": -2.1210938,
-          "text": "?"
-        }
-      ],
      "seed": null,
      "tokens": [
        {
@@ -316,38 +223,7 @@
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
-      "prefill": [
+      "prefill": [],
-        {
-          "id": 128000,
-          "logprob": null,
-          "text": "<|begin_of_text|>"
-        },
-        {
-          "id": 3923,
-          "logprob": -7.5351562,
-          "text": "What"
-        },
-        {
-          "id": 374,
-          "logprob": -0.85791016,
-          "text": " is"
-        },
-        {
-          "id": 5655,
-          "logprob": -8.828125,
-          "text": " deep"
-        },
-        {
-          "id": 6975,
-          "logprob": -1.4882812,
-          "text": " learning"
-        },
-        {
-          "id": 30,
-          "logprob": -2.1210938,
-          "text": "?"
-        }
-      ],
      "seed": null,
      "tokens": [
        {

--- a/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq.json
+++ b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq.json
@@ -3,38 +3,7 @@
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
-    "prefill": [
+    "prefill": [],
-      {
-        "id": 1,
-        "logprob": null,
-        "text": "<s>"
-      },
-      {
-        "id": 1724,
-        "logprob": -7.703125,
-        "text": "What"
-      },
-      {
-        "id": 338,
-        "logprob": -1.4765625,
-        "text": "is"
-      },
-      {
-        "id": 21784,
-        "logprob": -9.390625,
-        "text": "Deep"
-      },
-      {
-        "id": 29257,
-        "logprob": -1.8583984,
-        "text": "Learning"
-      },
-      {
-        "id": 29973,
-        "logprob": -0.7548828,
-        "text": "?"
-      }
-    ],
    "seed": null,
    "tokens": [
      {

--- a/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_all_params.json
+++ b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_all_params.json
@@ -3,33 +3,7 @@
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
-    "prefill": [
+    "prefill": [],
-      {
-        "id": 1,
-        "logprob": null,
-        "text": "<s>"
-      },
-      {
-        "id": 338,
-        "logprob": -9.0859375,
-        "text": "is"
-      },
-      {
-        "id": 21784,
-        "logprob": -10.90625,
-        "text": "Deep"
-      },
-      {
-        "id": 29257,
-        "logprob": -2.65625,
-        "text": "Learning"
-      },
-      {
-        "id": 29973,
-        "logprob": -4.8085938,
-        "text": "?"
-      }
-    ],
    "seed": 0,
    "tokens": [
      {

--- a/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_load.json
+++ b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_load.json
@@ -4,38 +4,7 @@
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
-      "prefill": [
+      "prefill": [],
-        {
-          "id": 1,
-          "logprob": null,
-          "text": "<s>"
-        },
-        {
-          "id": 1724,
-          "logprob": -7.703125,
-          "text": "What"
-        },
-        {
-          "id": 338,
-          "logprob": -1.4765625,
-          "text": "is"
-        },
-        {
-          "id": 21784,
-          "logprob": -9.390625,
-          "text": "Deep"
-        },
-        {
-          "id": 29257,
-          "logprob": -1.8652344,
-          "text": "Learning"
-        },
-        {
-          "id": 29973,
-          "logprob": -0.7548828,
-          "text": "?"
-        }
-      ],
      "seed": null,
      "tokens": [
        {
@@ -108,38 +77,7 @@
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
-      "prefill": [
+      "prefill": [],
-        {
-          "id": 1,
-          "logprob": null,
-          "text": "<s>"
-        },
-        {
-          "id": 1724,
-          "logprob": -7.703125,
-          "text": "What"
-        },
-        {
-          "id": 338,
-          "logprob": -1.4765625,
-          "text": "is"
-        },
-        {
-          "id": 21784,
-          "logprob": -9.390625,
-          "text": "Deep"
-        },
-        {
-          "id": 29257,
-          "logprob": -1.8583984,
-          "text": "Learning"
-        },
-        {
-          "id": 29973,
-          "logprob": -0.7548828,
-          "text": "?"
-        }
-      ],
      "seed": null,
      "tokens": [
        {
@@ -212,38 +150,7 @@
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
-      "prefill": [
+      "prefill": [],
-        {
-          "id": 1,
-          "logprob": null,
-          "text": "<s>"
-        },
-        {
-          "id": 1724,
-          "logprob": -7.703125,
-          "text": "What"
-        },
-        {
-          "id": 338,
-          "logprob": -1.4765625,
-          "text": "is"
-        },
-        {
-          "id": 21784,
-          "logprob": -9.390625,
-          "text": "Deep"
-        },
-        {
-          "id": 29257,
-          "logprob": -1.8652344,
-          "text": "Learning"
-        },
-        {
-          "id": 29973,
-          "logprob": -0.7548828,
-          "text": "?"
-        }
-      ],
      "seed": null,
      "tokens": [
        {
@@ -316,38 +223,7 @@
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
-      "prefill": [
+      "prefill": [],
-        {
-          "id": 1,
-          "logprob": null,
-          "text": "<s>"
-        },
-        {
-          "id": 1724,
-          "logprob": -7.703125,
-          "text": "What"
-        },
-        {
-          "id": 338,
-          "logprob": -1.4765625,
-          "text": "is"
-        },
-        {
-          "id": 21784,
-          "logprob": -9.390625,
-          "text": "Deep"
-        },
-        {
-          "id": 29257,
-          "logprob": -1.8652344,
-          "text": "Learning"
-        },
-        {
-          "id": 29973,
-          "logprob": -0.7548828,
-          "text": "?"
-        }
-      ],
      "seed": null,
      "tokens": [
        {