Unverified Commit d5b5bc75 authored by Nicolas Patry, committed by GitHub

feat(server): Add exllama GPTQ CUDA kernel support #553 (#666)

Just trying to get the integration tests to pass.
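For context, a minimal sketch of how the new kernels get exercised end to end, mirroring the integration tests added in this PR (the model id, launcher fixture, and client API below are lifted from those tests; the wrapper function is illustrative):

async def gptq_smoke_test(launcher):
    # launch the server with GPTQ weights routed through the exllama kernels,
    # then issue a simple generate request against it
    with launcher("huggingface/llama-7b-gptq", num_shard=2, quantize="gptq") as handle:
        await handle.health(300)
        response = await handle.client.generate(
            "Test request", max_new_tokens=10, decoder_input_details=True
        )
        assert response.details.generated_tokens == 10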


# What does this PR do?

<!--
Congratulations! You've made it this far! You're not quite done yet
though.

Once merged, your PR is going to appear in the release notes with the
title you set, so make sure it's a great title that fully reflects the
extent of your awesome contribution.

Then, please replace this with a description of the change and which
issue is fixed (if applicable). Please also include relevant motivation
and context. List any dependencies (if any) that are required for this
change.

Once you're done, someone will review your PR shortly (see the section
"Who can review?" below to tag some potential reviewers). They may
suggest changes to make the code even better. If no one reviewed your PR
after a week has passed, don't hesitate to post a new comment
@-mentioning the same persons---sometimes notifications get lost.
-->

<!-- Remove if not applicable -->

Fixes # (issue)


## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Did you read the [contributor
guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
      Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the
[forum](https://discuss.huggingface.co/)? Please add a link
      to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes? Here are the
      [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and
      [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?


## Who can review?

Anyone in the community is free to review the PR once the tests have
passed. Feel free to tag
members/contributors who may be interested in your PR.

<!-- Your PR will be replied to more quickly if you can figure out the
right person to tag with @


@OlivierDehaene OR @Narsil

 -->

---------
Co-authored-by: Felix Marty <9808326+fxmarty@users.noreply.github.com>
parent bf94df3c
@@ -108,6 +108,17 @@ COPY server/Makefile-flash-att-v2 Makefile
 # Build specific version of flash attention v2
 RUN make build-flash-attention-v2
 
+# Build Transformers exllama kernels
+FROM kernel-builder as exllama-kernels-builder
+
+WORKDIR /usr/src
+
+COPY server/exllama_kernels/ .
+
+
+# Build specific version of transformers
+RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
+
 # Build Transformers CUDA kernels
 FROM kernel-builder as custom-kernels-builder
@@ -161,6 +172,8 @@ COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
 # Copy build artifacts from custom kernels builder
 COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+# Copy build artifacts from exllama kernels builder
+COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
 # Copy builds artifacts from vllm builder
 COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
@@ -56,3 +56,6 @@ run-bloom:
 run-bloom-quantize:
 	text-generation-launcher --model-id bigscience/bloom --num-shard 8 --quantize --port 8080
+
+clean:
+	rm -rf target aml
@@ -230,15 +230,16 @@ def launcher(event_loop):
             shard_uds_path,
         ]
 
+        env = os.environ
+
         if num_shard is not None:
             args.extend(["--num-shard", str(num_shard)])
-        if quantize:
+        if quantize is not None:
             args.append("--quantize")
-            args.append("bitsandbytes")
+            args.append(quantize)
         if trust_remote_code:
             args.append("--trust-remote-code")
 
-        env = os.environ
         env["LOG_LEVEL"] = "info,text_generation_router=debug"
 
         if not use_flash_attention:
@@ -275,9 +276,9 @@ def launcher(event_loop):
 
         if num_shard is not None:
             args.extend(["--num-shard", str(num_shard)])
-        if quantize:
+        if quantize is not None:
             args.append("--quantize")
-            args.append("bitsandbytes")
+            args.append(quantize)
         if trust_remote_code:
             args.append("--trust-remote-code")
 
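The effect of this fixture change, in short: quantize is now a mode string forwarded verbatim to the launcher instead of a boolean that implied bitsandbytes. A minimal sketch of the resulting argument construction (model_id and the exact argument list are illustrative):

args = ["text-generation-launcher", "--model-id", model_id]
if quantize is not None:
    # e.g. "bitsandbytes" (the previously hardcoded value) or the new "gptq"
    args.append("--quantize")
    args.append(quantize)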
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -9.59375,
"text": "Test"
},
{
"id": 2009,
"logprob": -9.6640625,
"text": "request"
}
],
"seed": null,
"tokens": [
{
"id": 29918,
"logprob": -2.3867188,
"special": false,
"text": "_"
},
{
"id": 5338,
"logprob": -2.8183594,
"special": false,
"text": "uri"
},
{
"id": 13,
"logprob": -1.6367188,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -1.0527344,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.6542969,
"special": false,
"text": " request"
},
{
"id": 29918,
"logprob": -0.056121826,
"special": false,
"text": "_"
},
{
"id": 5338,
"logprob": -0.01600647,
"special": false,
"text": "uri"
},
{
"id": 13,
"logprob": -0.87939453,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -0.7529297,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.2980957,
"special": false,
"text": " request"
}
]
},
"generated_text": "_uri\nTest request_uri\nTest request"
}
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -9.6015625,
"text": "Test"
},
{
"id": 2009,
"logprob": -9.6640625,
"text": "request"
}
],
"seed": 0,
"tokens": [
{
"id": 29899,
"logprob": -1.1640625,
"special": false,
"text": "-"
},
{
"id": 1454,
"logprob": -0.07543945,
"special": false,
"text": "for"
},
{
"id": 29899,
"logprob": 0.0,
"special": false,
"text": "-"
},
{
"id": 9342,
"logprob": 0.0,
"special": false,
"text": "comment"
},
{
"id": 29901,
"logprob": 0.0,
"special": false,
"text": ":"
},
{
"id": 396,
"logprob": -0.2956543,
"special": false,
"text": " #"
},
{
"id": 29906,
"logprob": -0.52734375,
"special": false,
"text": "2"
},
{
"id": 29900,
"logprob": -0.6899414,
"special": false,
"text": "0"
},
{
"id": 29896,
"logprob": 0.0,
"special": false,
"text": "1"
},
{
"id": 29946,
"logprob": -1.5068359,
"special": false,
"text": "4"
}
]
},
"generated_text": "Test request-for-comment: #2014"
}
[
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -9.6015625,
"text": "Test"
},
{
"id": 2009,
"logprob": -9.671875,
"text": "request"
}
],
"seed": null,
"tokens": [
{
"id": 29918,
"logprob": -2.3828125,
"special": false,
"text": "_"
},
{
"id": 5338,
"logprob": -2.8105469,
"special": false,
"text": "uri"
},
{
"id": 13,
"logprob": -1.6396484,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -1.0546875,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.6513672,
"special": false,
"text": " request"
},
{
"id": 29918,
"logprob": -0.056365967,
"special": false,
"text": "_"
},
{
"id": 5338,
"logprob": -0.016082764,
"special": false,
"text": "uri"
},
{
"id": 13,
"logprob": -0.87841797,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -0.7548828,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.29711914,
"special": false,
"text": " request"
}
]
},
"generated_text": "_uri\nTest request_uri\nTest request"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -9.6015625,
"text": "Test"
},
{
"id": 2009,
"logprob": -9.6640625,
"text": "request"
}
],
"seed": null,
"tokens": [
{
"id": 29918,
"logprob": -2.3828125,
"special": false,
"text": "_"
},
{
"id": 5338,
"logprob": -2.828125,
"special": false,
"text": "uri"
},
{
"id": 13,
"logprob": -1.6386719,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -1.0527344,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.6542969,
"special": false,
"text": " request"
},
{
"id": 29918,
"logprob": -0.055877686,
"special": false,
"text": "_"
},
{
"id": 5338,
"logprob": -0.016021729,
"special": false,
"text": "uri"
},
{
"id": 13,
"logprob": -0.8769531,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -0.7583008,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.29833984,
"special": false,
"text": " request"
}
]
},
"generated_text": "_uri\nTest request_uri\nTest request"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -9.6015625,
"text": "Test"
},
{
"id": 2009,
"logprob": -9.671875,
"text": "request"
}
],
"seed": null,
"tokens": [
{
"id": 29918,
"logprob": -2.3847656,
"special": false,
"text": "_"
},
{
"id": 5338,
"logprob": -2.8144531,
"special": false,
"text": "uri"
},
{
"id": 13,
"logprob": -1.6396484,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -1.0527344,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.65478516,
"special": false,
"text": " request"
},
{
"id": 29918,
"logprob": -0.056243896,
"special": false,
"text": "_"
},
{
"id": 5338,
"logprob": -0.016143799,
"special": false,
"text": "uri"
},
{
"id": 13,
"logprob": -0.8808594,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -0.75341797,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.2956543,
"special": false,
"text": " request"
}
]
},
"generated_text": "_uri\nTest request_uri\nTest request"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -9.6015625,
"text": "Test"
},
{
"id": 2009,
"logprob": -9.6640625,
"text": "request"
}
],
"seed": null,
"tokens": [
{
"id": 29918,
"logprob": -2.3769531,
"special": false,
"text": "_"
},
{
"id": 5338,
"logprob": -2.8183594,
"special": false,
"text": "uri"
},
{
"id": 13,
"logprob": -1.6396484,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -1.0546875,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.65478516,
"special": false,
"text": " request"
},
{
"id": 29918,
"logprob": -0.05557251,
"special": false,
"text": "_"
},
{
"id": 5338,
"logprob": -0.01612854,
"special": false,
"text": "uri"
},
{
"id": 13,
"logprob": -0.8730469,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -0.7519531,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.29785156,
"special": false,
"text": " request"
}
]
},
"generated_text": "_uri\nTest request_uri\nTest request"
}
]
{
"generated_text": "\n return sum(L) / len(L)\n\n\ndef geometric_mean(L",
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 20,
"seed": null,
"prefill": [
{
"id": 589,
"text": "def",
"logprob": null
},
{
"id": 3226,
"text": " ge",
"logprob": -9.0234375
},
{
"id": 21017,
"text": "ometric",
"logprob": -9.0859375
},
{
"id": 81,
"text": "_",
"logprob": -0.25878906
},
{
"id": 6009,
"text": "mean",
"logprob": -2.2109375
},
{
"id": 26,
"text": "(",
"logprob": -0.30371094
},
{
"id": 62,
"text": "L",
"logprob": -5.6054688
},
{
"id": 44,
"text": ":",
"logprob": -3.0722656
},
{
"id": 1682,
"text": " List",
"logprob": -0.6879883
},
{
"id": 77,
"text": "[",
"logprob": -0.38500977
},
{
"id": 1808,
"text": "float",
"logprob": -0.984375
},
{
"id": 10794,
"text": "]):",
"logprob": -2.5351562
}
],
"tokens": [
{
"id": 284,
"text": "\n ",
"logprob": -1.1738281,
"special": false
},
{
"id": 442,
"text": " return",
"logprob": -0.95947266,
"special": false
},
{
"id": 3632,
"text": " sum",
"logprob": -1.4199219,
"special": false
},
{
"id": 26,
"text": "(",
"logprob": -0.085876465,
"special": false
},
{
"id": 62,
"text": "L",
"logprob": -0.09875488,
"special": false
},
{
"id": 27,
"text": ")",
"logprob": -0.30517578,
"special": false
},
{
"id": 517,
"text": " /",
"logprob": -0.42089844,
"special": false
},
{
"id": 2069,
"text": " len",
"logprob": -0.042053223,
"special": false
},
{
"id": 26,
"text": "(",
"logprob": -0.0011806488,
"special": false
},
{
"id": 62,
"text": "L",
"logprob": -0.0005259514,
"special": false
},
{
"id": 27,
"text": ")",
"logprob": -0.0017633438,
"special": false
},
{
"id": 478,
"text": "\n\n",
"logprob": -0.69189453,
"special": false
},
{
"id": 203,
"text": "\n",
"logprob": -0.041870117,
"special": false
},
{
"id": 589,
"text": "def",
"logprob": -0.27856445,
"special": false
},
{
"id": 3226,
"text": " ge",
"logprob": -1.7255859,
"special": false
},
{
"id": 21017,
"text": "ometric",
"logprob": -0.011291504,
"special": false
},
{
"id": 81,
"text": "_",
"logprob": -0.008430481,
"special": false
},
{
"id": 6009,
"text": "mean",
"logprob": -0.025787354,
"special": false
},
{
"id": 26,
"text": "(",
"logprob": -0.073913574,
"special": false
},
{
"id": 62,
"text": "L",
"logprob": -0.09967041,
"special": false
}
]
}
}
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 20,
"prefill": [
{
"id": 589,
"logprob": null,
"text": "def"
},
{
"id": 3226,
"logprob": -9.0234375,
"text": " ge"
},
{
"id": 21017,
"logprob": -9.09375,
"text": "ometric"
},
{
"id": 81,
"logprob": -0.25976562,
"text": "_"
},
{
"id": 6009,
"logprob": -2.2148438,
"text": "mean"
},
{
"id": 26,
"logprob": -0.3010254,
"text": "("
},
{
"id": 62,
"logprob": -5.6757812,
"text": "L"
},
{
"id": 44,
"logprob": -3.0898438,
"text": ":"
},
{
"id": 1682,
"logprob": -0.6791992,
"text": " List"
},
{
"id": 77,
"logprob": -0.38891602,
"text": "["
},
{
"id": 1808,
"logprob": -0.92041016,
"text": "float"
},
{
"id": 10794,
"logprob": -2.5390625,
"text": "]):"
}
],
"seed": 0,
"tokens": [
{
"id": 284,
"logprob": 0.0,
"special": false,
"text": "\n "
},
{
"id": 442,
"logprob": 0.0,
"special": false,
"text": " return"
},
{
"id": 11665,
"logprob": -1.6005859,
"special": false,
"text": " reduce"
},
{
"id": 26,
"logprob": 0.0,
"special": false,
"text": "("
},
{
"id": 5962,
"logprob": 0.0,
"special": false,
"text": "lambda"
},
{
"id": 816,
"logprob": 0.0,
"special": false,
"text": " x"
},
{
"id": 30,
"logprob": 0.0,
"special": false,
"text": ","
},
{
"id": 533,
"logprob": 0.0,
"special": false,
"text": " y"
},
{
"id": 44,
"logprob": 0.0,
"special": false,
"text": ":"
},
{
"id": 816,
"logprob": 0.0,
"special": false,
"text": " x"
},
{
"id": 319,
"logprob": 0.0,
"special": false,
"text": " *"
},
{
"id": 533,
"logprob": 0.0,
"special": false,
"text": " y"
},
{
"id": 30,
"logprob": 0.0,
"special": false,
"text": ","
},
{
"id": 498,
"logprob": 0.0,
"special": false,
"text": " L"
},
{
"id": 27,
"logprob": 0.0,
"special": false,
"text": ")"
},
{
"id": 203,
"logprob": -0.11968994,
"special": false,
"text": "\n"
},
{
"id": 203,
"logprob": 0.0,
"special": false,
"text": "\n"
},
{
"id": 589,
"logprob": 0.0,
"special": false,
"text": "def"
},
{
"id": 3226,
"logprob": 0.0,
"special": false,
"text": " ge"
},
{
"id": 21017,
"logprob": 0.0,
"special": false,
"text": "ometric"
}
]
},
"generated_text": "\n return reduce(lambda x, y: x * y, L)\n\ndef geometric"
}
[
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 589,
"logprob": null,
"text": "def"
},
{
"id": 3226,
"logprob": -9.0234375,
"text": " ge"
},
{
"id": 21017,
"logprob": -9.0859375,
"text": "ometric"
},
{
"id": 81,
"logprob": -0.25927734,
"text": "_"
},
{
"id": 6009,
"logprob": -2.25,
"text": "mean"
},
{
"id": 26,
"logprob": -0.30126953,
"text": "("
},
{
"id": 62,
"logprob": -5.7539062,
"text": "L"
},
{
"id": 44,
"logprob": -3.0878906,
"text": ":"
},
{
"id": 1682,
"logprob": -0.6845703,
"text": " List"
},
{
"id": 77,
"logprob": -0.3918457,
"text": "["
},
{
"id": 1808,
"logprob": -0.8798828,
"text": "float"
},
{
"id": 10794,
"logprob": -2.4980469,
"text": "]):"
}
],
"seed": null,
"tokens": [
{
"id": 284,
"logprob": -1.1533203,
"special": false,
"text": "\n "
},
{
"id": 442,
"logprob": -0.91796875,
"special": false,
"text": " return"
},
{
"id": 3632,
"logprob": -1.3291016,
"special": false,
"text": " sum"
},
{
"id": 26,
"logprob": -0.08062744,
"special": false,
"text": "("
},
{
"id": 62,
"logprob": -0.097717285,
"special": false,
"text": "L"
},
{
"id": 27,
"logprob": -0.29003906,
"special": false,
"text": ")"
},
{
"id": 517,
"logprob": -0.34958984,
"special": false,
"text": " /"
},
{
"id": 2069,
"logprob": -0.03829956,
"special": false,
"text": " len"
},
{
"id": 26,
"logprob": -0.0011987686,
"special": false,
"text": "("
},
{
"id": 62,
"logprob": -0.00050878525,
"special": false,
"text": "L"
}
]
},
"generated_text": "\n return sum(L) / len(L"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 589,
"logprob": null,
"text": "def"
},
{
"id": 3226,
"logprob": -9.0234375,
"text": " ge"
},
{
"id": 21017,
"logprob": -9.0859375,
"text": "ometric"
},
{
"id": 81,
"logprob": -0.25878906,
"text": "_"
},
{
"id": 6009,
"logprob": -2.2109375,
"text": "mean"
},
{
"id": 26,
"logprob": -0.30371094,
"text": "("
},
{
"id": 62,
"logprob": -5.6054688,
"text": "L"
},
{
"id": 44,
"logprob": -3.0722656,
"text": ":"
},
{
"id": 1682,
"logprob": -0.6879883,
"text": " List"
},
{
"id": 77,
"logprob": -0.38500977,
"text": "["
},
{
"id": 1808,
"logprob": -0.984375,
"text": "float"
},
{
"id": 10794,
"logprob": -2.5351562,
"text": "]):"
}
],
"seed": null,
"tokens": [
{
"id": 284,
"logprob": -1.1738281,
"special": false,
"text": "\n "
},
{
"id": 442,
"logprob": -0.9584961,
"special": false,
"text": " return"
},
{
"id": 3632,
"logprob": -1.4169922,
"special": false,
"text": " sum"
},
{
"id": 26,
"logprob": -0.085876465,
"special": false,
"text": "("
},
{
"id": 62,
"logprob": -0.0982666,
"special": false,
"text": "L"
},
{
"id": 27,
"logprob": -0.3022461,
"special": false,
"text": ")"
},
{
"id": 517,
"logprob": -0.40504883,
"special": false,
"text": " /"
},
{
"id": 2069,
"logprob": -0.041656494,
"special": false,
"text": " len"
},
{
"id": 26,
"logprob": -0.0011844635,
"special": false,
"text": "("
},
{
"id": 62,
"logprob": -0.0005264282,
"special": false,
"text": "L"
}
]
},
"generated_text": "\n return sum(L) / len(L"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 589,
"logprob": null,
"text": "def"
},
{
"id": 3226,
"logprob": -9.0234375,
"text": " ge"
},
{
"id": 21017,
"logprob": -9.0859375,
"text": "ometric"
},
{
"id": 81,
"logprob": -0.25927734,
"text": "_"
},
{
"id": 6009,
"logprob": -2.25,
"text": "mean"
},
{
"id": 26,
"logprob": -0.30126953,
"text": "("
},
{
"id": 62,
"logprob": -5.7539062,
"text": "L"
},
{
"id": 44,
"logprob": -3.0878906,
"text": ":"
},
{
"id": 1682,
"logprob": -0.6845703,
"text": " List"
},
{
"id": 77,
"logprob": -0.3918457,
"text": "["
},
{
"id": 1808,
"logprob": -0.8798828,
"text": "float"
},
{
"id": 10794,
"logprob": -2.4980469,
"text": "]):"
}
],
"seed": null,
"tokens": [
{
"id": 284,
"logprob": -1.1533203,
"special": false,
"text": "\n "
},
{
"id": 442,
"logprob": -0.9165039,
"special": false,
"text": " return"
},
{
"id": 3632,
"logprob": -1.328125,
"special": false,
"text": " sum"
},
{
"id": 26,
"logprob": -0.07946777,
"special": false,
"text": "("
},
{
"id": 62,
"logprob": -0.09820557,
"special": false,
"text": "L"
},
{
"id": 27,
"logprob": -0.28930664,
"special": false,
"text": ")"
},
{
"id": 517,
"logprob": -0.34592773,
"special": false,
"text": " /"
},
{
"id": 2069,
"logprob": -0.038330078,
"special": false,
"text": " len"
},
{
"id": 26,
"logprob": -0.0011940002,
"special": false,
"text": "("
},
{
"id": 62,
"logprob": -0.00050878525,
"special": false,
"text": "L"
}
]
},
"generated_text": "\n return sum(L) / len(L"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 589,
"logprob": null,
"text": "def"
},
{
"id": 3226,
"logprob": -9.0234375,
"text": " ge"
},
{
"id": 21017,
"logprob": -9.0859375,
"text": "ometric"
},
{
"id": 81,
"logprob": -0.25927734,
"text": "_"
},
{
"id": 6009,
"logprob": -2.25,
"text": "mean"
},
{
"id": 26,
"logprob": -0.30126953,
"text": "("
},
{
"id": 62,
"logprob": -5.7539062,
"text": "L"
},
{
"id": 44,
"logprob": -3.0878906,
"text": ":"
},
{
"id": 1682,
"logprob": -0.6845703,
"text": " List"
},
{
"id": 77,
"logprob": -0.3918457,
"text": "["
},
{
"id": 1808,
"logprob": -0.8798828,
"text": "float"
},
{
"id": 10794,
"logprob": -2.4980469,
"text": "]):"
}
],
"seed": null,
"tokens": [
{
"id": 284,
"logprob": -1.1533203,
"special": false,
"text": "\n "
},
{
"id": 442,
"logprob": -0.91259766,
"special": false,
"text": " return"
},
{
"id": 3632,
"logprob": -1.3251953,
"special": false,
"text": " sum"
},
{
"id": 26,
"logprob": -0.08062744,
"special": false,
"text": "("
},
{
"id": 62,
"logprob": -0.09906006,
"special": false,
"text": "L"
},
{
"id": 27,
"logprob": -0.28979492,
"special": false,
"text": ")"
},
{
"id": 517,
"logprob": -0.35958984,
"special": false,
"text": " /"
},
{
"id": 2069,
"logprob": -0.038604736,
"special": false,
"text": " len"
},
{
"id": 26,
"logprob": -0.0011901855,
"special": false,
"text": "("
},
{
"id": 62,
"logprob": -0.0005078316,
"special": false,
"text": "L"
}
]
},
"generated_text": "\n return sum(L) / len(L"
}
]
import pytest


@pytest.fixture(scope="module")
def flash_llama_gptq_handle(launcher):
    with launcher("huggingface/llama-7b-gptq", num_shard=2, quantize="gptq") as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_llama_gptq(flash_llama_gptq_handle):
    await flash_llama_gptq_handle.health(300)
    return flash_llama_gptq_handle.client


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_gptq(flash_llama_gptq, response_snapshot):
    response = await flash_llama_gptq.generate(
        "Test request", max_new_tokens=10, decoder_input_details=True
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_gptq_all_params(flash_llama_gptq, response_snapshot):
    response = await flash_llama_gptq.generate(
        "Test request",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_gptq_load(flash_llama_gptq, generate_load, response_snapshot):
    responses = await generate_load(flash_llama_gptq, "Test request", max_new_tokens=10, n=4)

    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])
    assert responses == response_snapshot
import pytest


@pytest.fixture(scope="module")
def flash_starcoder_gptq_handle(launcher):
    with launcher("Narsil/starcoder-gptq", num_shard=2, quantize="gptq") as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_starcoder_gptq(flash_starcoder_gptq_handle):
    await flash_starcoder_gptq_handle.health(300)
    return flash_starcoder_gptq_handle.client


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_starcoder_gptq(flash_starcoder_gptq, response_snapshot):
    response = await flash_starcoder_gptq.generate(
        "def geometric_mean(L: List[float]):",
        max_new_tokens=20,
        decoder_input_details=True,
    )
    assert response.details.generated_tokens == 20
    assert response == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_starcoder_gptq_default_params(flash_starcoder_gptq, response_snapshot):
    response = await flash_starcoder_gptq.generate(
        "def geometric_mean(L: List[float]):",
        max_new_tokens=20,
        temperature=0.2,
        top_p=0.95,
        decoder_input_details=True,
        seed=0,
    )
    assert response.details.generated_tokens == 20
    assert response == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_starcoder_gptq_load(flash_starcoder_gptq, generate_load, response_snapshot):
    responses = await generate_load(
        flash_starcoder_gptq,
        "def geometric_mean(L: List[float]):",
        max_new_tokens=10,
        n=4,
    )

    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])
    assert responses == response_snapshot
// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#define _cuda_buffers_cu
#include "cuda_buffers.cuh"
CudaBuffers* g_buffers[CUDA_MAX_DEVICES] = {NULL};
// __constant__ half2 q4_table[16][256];
// half2 q4_table_host[16][256];
// bool q4_table_init = false;
CudaBuffers::CudaBuffers
(
int _device,
half* _temp_state,
half* _temp_dq
) :
device(_device),
temp_state(_temp_state),
temp_dq(_temp_dq)
{
cudaSetDevice(_device);
cudaStreamCreate(&alt_stream_1);
cudaStreamCreate(&alt_stream_2);
cudaStreamCreate(&alt_stream_3);
cudaEventCreate(&alt_stream_1_done);
cudaEventCreate(&alt_stream_2_done);
cudaEventCreate(&alt_stream_3_done);
}
CudaBuffers::~CudaBuffers()
{
cudaStreamDestroy(alt_stream_1);
cudaStreamDestroy(alt_stream_2);
cudaStreamDestroy(alt_stream_3);
cudaEventDestroy(alt_stream_1_done);
cudaEventDestroy(alt_stream_2_done);
cudaEventDestroy(alt_stream_3_done);
}
CudaBuffers* get_buffers(const int device_index)
{
return g_buffers[device_index];
}
void prepare_buffers_cuda
(
int _device,
half* _temp_state,
half* _temp_dq
)
{
CudaBuffers* buffers = new CudaBuffers
(
_device,
_temp_state,
_temp_dq
);
g_buffers[_device] = buffers;
}
void cleanup_buffers_cuda()
{
for (int i = 0; i < CUDA_MAX_DEVICES; i++)
{
if (!g_buffers[i]) continue;
delete g_buffers[i];
g_buffers[i] = NULL;
}
}
// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#ifndef _cuda_buffers_cuh
#define _cuda_buffers_cuh
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>
#include <cstdio>
const int CUDA_MAX_DEVICES = 16;
// #ifndef _cuda_buffers_cu
// extern __constant__ half2 q4_table[16][256];
// #endif
class CudaBuffers
{
public:
int device;
half* temp_state; // [max_hidden_rows * intermediate_size]
half* temp_dq; // size of largest quant tensor * 8
cudaStream_t alt_stream_1;
cudaStream_t alt_stream_2;
cudaStream_t alt_stream_3;
cudaEvent_t alt_stream_1_done;
cudaEvent_t alt_stream_2_done;
cudaEvent_t alt_stream_3_done;
CudaBuffers
(
int _device,
half* _temp_state,
half* _temp_dq
);
~CudaBuffers();
};
CudaBuffers* get_buffers(const int device_index);
void prepare_buffers_cuda
(
int _device,
half* _temp_state,
half* _temp_dq
);
void cleanup_buffers_cuda();
#endif
// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#ifndef _cuda_compat_cuh
#define _cuda_compat_cuh
// atomicAdd for half types, to support CC < 7.x
__device__ __forceinline__ void atomicAdd_half(half* address, half val)
{
unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2));
unsigned int old = *address_as_ui;
unsigned int assumed;
do
{
assumed = old;
__half_raw hsum;
hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
half tmpres = __hadd(hsum, val);
hsum = __half_raw(tmpres);
old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
old = atomicCAS(address_as_ui, assumed, old);
}
while (assumed != old);
}
// atomicAdd for half2 types
__device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val)
{
unsigned int* address_as_ui = (unsigned int*)address;
unsigned int old = *address_as_ui;
unsigned int assumed;
do
{
assumed = old;
half2 old_val = *((half2*)&old);
half2 new_val = __hadd2(old_val, val);
old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val));
}
while (assumed != old);
}
//
#if defined(__CUDA_ARCH__)
#if __CUDA_ARCH__ < 700
__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }
#if __CUDA_ARCH__ < 600
__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
#endif
#endif
#endif
#endif
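The two helpers above emulate half-precision atomicAdd on architectures that lack it by retrying a 32-bit compare-and-swap until no other thread raced the update. A rough Python rendering of that retry loop (compare_and_swap is a hypothetical stand-in for CUDA's atomicCAS, which returns the word's previous value):

def atomic_add_emulated(mem, addr, val):
    # keep retrying until the word was not modified by another thread
    # between our read and our swap, mirroring the do/while(assumed != old)
    while True:
        assumed = mem[addr]
        old = compare_and_swap(mem, addr, assumed, assumed + val)
        if old == assumed:
            return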
// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#include "column_remap.cuh"
#include "../util.cuh"
const int SHUF_BLOCKSIZE_X = 256;
const int SHUF_BLOCKSIZE_Y = 16;
__global__ void column_remap_kernel
(
const half* __restrict__ x,
half* __restrict__ x_new,
const int x_width,
const int x_height,
const uint32_t* x_map
)
{
int x_column = SHUF_BLOCKSIZE_X * blockIdx.x + threadIdx.x;
int x_row = SHUF_BLOCKSIZE_Y * blockIdx.y;
int x_stride = x_width;
int x_idx = x_row * x_stride + x_column;
int x_row_end = min(x_row + SHUF_BLOCKSIZE_Y, x_height);
int x_idx_end = x_row_end * x_stride + x_column;
int s_column = x_map[x_column];
int s_idx = x_row * x_stride + s_column;
while (x_idx < x_idx_end)
{
x_new[x_idx] = x[s_idx];
x_idx += x_stride;
s_idx += x_stride;
}
}
// Remap columns in x to correspond to sequential group index before matmul
//
// perform x -> seq_x such that seq_x @ seq_w == x @ w
void column_remap_cuda
(
const half* x,
half* x_new,
const int x_height,
const int x_width,
const uint32_t* x_map
)
{
dim3 threads(SHUF_BLOCKSIZE_X, 1, 1);
dim3 blocks
(
(x_width + SHUF_BLOCKSIZE_X - 1) / SHUF_BLOCKSIZE_X,
(x_height + SHUF_BLOCKSIZE_Y - 1) / SHUF_BLOCKSIZE_Y,
1
);
column_remap_kernel<<<blocks, threads>>>(x, x_new, x_width, x_height, x_map);
}
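A NumPy sketch of what this kernel computes, and of the invariant the comment above states (w_seq is a hypothetical name for the row-permuted, "sequential" weight matrix):

import numpy as np

def column_remap(x, x_map):
    # mirrors column_remap_kernel: x_new[:, j] = x[:, x_map[j]]
    return x[:, x_map]

# With w_seq[j, :] == w[x_map[j], :], permuting x's columns and w's rows by
# the same map leaves the product unchanged:
#   column_remap(x, x_map) @ w_seq == x @ w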
// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#ifndef _column_remap_cuh
#define _column_remap_cuh
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>
void column_remap_cuda
(
const half* x,
half* x_new,
const int x_height,
const int x_width,
const uint32_t* x_map
);
#endif
#include "q4_matmul.cuh"
#include "column_remap.cuh"
#include "../util.cuh"
#include "../matrix.cuh"
#include "../cuda_compat.cuh"
#include "../cuda_buffers.cuh"
const int THREADS_X = 32; // Block size and thread count along columns in w and out
const int THREADS_Y = 1; // Block size and thread count along rows in x and out
typedef void (*fp_q4_matmul_kernel)
(
const half*,
const uint32_t*,
half*,
const half*,
const uint32_t*,
const int,
const int,
const int,
const int,
const int,
const uint32_t*,
bool
);
template<bool use_half2, bool use_groupsize, bool use_x_map>
__global__ void q4_matmul_kernel
(
const half* __restrict__ x,
const uint32_t* __restrict__ w,
half* __restrict__ out,
const half* __restrict__ w_scales,
const uint32_t* __restrict__ w_zeros,
const int height,
const int dim,
const int width,
const int groupsize,
const int block_size_z,
const uint32_t* __restrict__ x_map,
bool no_zero
)
{
// Start of block
int x_column = block_size_z * blockIdx.z;
int x_column_end = min(dim, block_size_z * (blockIdx.z + 1));
int w_column = THREADS_X * blockIdx.x + threadIdx.x;
int x_row = THREADS_Y * blockIdx.y + threadIdx.y;
int iterations = (x_column_end - x_column) / 8;
// Views
MatrixView_half x_(x, height, dim);
MatrixView_half w_scales_(w_scales, dim / groupsize, width);
MatrixView_q4_row w_zeros_(w_zeros, dim / groupsize, width);
MatrixView_q4_column w_(w, dim, width);
MatrixView_half_rw out_(out, height, width);
// Zero output
if (!no_zero && blockIdx.z == 0 && (threadIdx.x & 1) == 0)
{
*((uint32_t*) out_.item_ptr(x_row, w_column)) = 0;
__syncthreads();
}
// Loop over part of x row (and w column)
half2 acc = {};
half acc_h = {};
if constexpr (use_groupsize)
{
// For quant matrices where groupsize divides BLOCK_SIZE_Z we always start on a group boundary, so this
// could be slightly faster
for (int k = x_column, group = x_column / groupsize; k < x_column + iterations * 8; group++, k += groupsize)
{
if constexpr (use_half2)
{
half2 w_scale = w_scales_.item_half2half2(group, w_column);
uint32_t w_zero = w_zeros_.item(group, w_column) + 1;
if constexpr (use_x_map) acc = dot_product_8_x_map(acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8, x_map);
else acc = dot_product_8 (acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8);
}
else
{
half w_scale = w_scales_.item(group, w_column);
uint32_t w_zero = w_zeros_.item(group, w_column) + 1;
if constexpr (use_x_map) acc_h = dot_product_8_x_map_h(acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8, x_map);
else acc_h = dot_product_8_h (acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8);
}
}
}
else
{
// Otherwise assume groupsize is a multiple of 8, do 8 columns per iteration and trust the cache
for (int k = x_column; k < x_column + iterations * 8; k += 8)
{
if constexpr (use_half2)
{
int group = k / groupsize;
half2 w_scale = w_scales_.item_half2half2(group, w_column);
uint32_t w_zero = w_zeros_.item(group, w_column) + 1;
if constexpr (use_x_map) acc = dot_product_8_x_map(acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1, x_map);
else acc = dot_product_8 (acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1);
}
else
{
int group = k / groupsize;
half w_scale = w_scales_.item(group, w_column);
uint32_t w_zero = w_zeros_.item(group, w_column) + 1;
if constexpr (use_x_map) acc_h = dot_product_8_x_map_h(acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1, x_map);
else acc_h = dot_product_8_h (acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1);
}
}
}
// Add to block result
if constexpr (use_half2)
{
half result = __hadd(acc.x, acc.y);
atomicAdd(out_.item_ptr(x_row, w_column), result);
}
else
{
atomicAdd(out_.item_ptr(x_row, w_column), acc_h);
}
}
fp_q4_matmul_kernel q4_matmul_kernel_pick(ExLlamaTuning* tuningParams, int block_size_z, int groupsize, uint32_t* x_map)
{
// <bool use_half2, bool use_groupsize, bool use_x_map>
if (tuningParams->matmul_no_half2) {
if (block_size_z % groupsize == 0) {
if (x_map) return q4_matmul_kernel<false, true, true >;
else return q4_matmul_kernel<false, true, false>;
} else {
if (x_map) return q4_matmul_kernel<false, false, true >;
else return q4_matmul_kernel<false, false, false>;
}
} else {
if (block_size_z % groupsize == 0)
{
if (x_map) return q4_matmul_kernel<true, true, true >;
else return q4_matmul_kernel<true, true, false>;
} else {
if (x_map) return q4_matmul_kernel<true, false, true >;
else return q4_matmul_kernel<true, false, false>;
}
}
};
// Compute y = x @ w
void q4_matmul_cuda
(
ExLlamaTuning* tuningParams,
const half* x,
const int x_height,
const Q4Matrix* w,
half* out,
bool no_zero,
cudaStream_t alt_stream
)
{
int height = x_height;
int dim = w->height;
int width = w->width;
cudaSetDevice(w->device);
uint32_t* x_map = w->cuda_x_map;
const half* x_mapped = x;
if (x_map && !tuningParams->matmul_fused_remap && !alt_stream)
{
CudaBuffers* buffers = get_buffers(w->device);
column_remap_cuda(x, buffers->temp_state, x_height, dim, w->cuda_x_map);
x_mapped = buffers->temp_state;
x_map = NULL;
}
int block_size_z;
if (w->width == 4096) block_size_z = 384; // 7B
else if (w->width == 11008) block_size_z = 256;
else if (w->width == 5120) block_size_z = 384; // 13B
else if (w->width == 13824) block_size_z = 256;
else if (w->width == 6656) block_size_z = 256; // 33B
else if (w->width == 17920) block_size_z = 128;
else block_size_z = 256;
//if (!no_zero) cudaMemsetAsync(out, 0, x_height * w->width * sizeof(half));
dim3 threads(THREADS_X, THREADS_Y, 1);
dim3 blocks
(
(width + threads.x - 1) / threads.x,
(height + threads.y - 1) / threads.y,
(dim + block_size_z - 1) / block_size_z
);
fp_q4_matmul_kernel kernel = q4_matmul_kernel_pick(tuningParams, block_size_z, w->groupsize, x_map);
kernel<<<blocks, threads, 0, alt_stream>>> (x_mapped, w->cuda_qweight, out, w->cuda_scales, w->cuda_qzeros, height, dim, width, w->groupsize, block_size_z, x_map, no_zero);
}
void q4_matmul_recons_cuda
(
ExLlamaTuning* tuningParams,
const half* x,
const int x_height,
Q4Matrix* w,
half* out,
const cublasHandle_t handle,
bool no_zero
)
{
int height = x_height;
int dim = w->height;
int width = w->width;
cudaSetDevice(w->device);
CudaBuffers* buffers = get_buffers(w->device);
const half* x_mapped = x;
if (w->cuda_x_map)
{
column_remap_cuda(x, buffers->temp_state, x_height, dim, w->cuda_x_map);
x_mapped = buffers->temp_state;
}
w->reconstruct(buffers->temp_dq);
const half alpha = __float2half(1.0f);
const half beta = no_zero ? __float2half(1.0f) : __float2half(0.0f);
cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, width, x_mapped, dim, &beta, out, width);
// const float alpha = 1.0f;
// const float beta = no_zero ? 1.0f : 0.0f;
// cublasSgemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, CUDA_R_16F, width,
// x_mapped, CUDA_R_16F, dim, &beta, out, CUDA_R_16F, width);
}
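Both entry points share the same accumulation semantics: with no_zero the product is added into out, otherwise out is overwritten (the fused kernel zeroes out in its first z-block; the recons path sets the cublasHgemm beta to 1.0 or 0.0). A hypothetical NumPy reference for that contract:

import numpy as np

def q4_matmul_ref(x, w_dequant, out, no_zero=False):
    # out = x @ w, optionally accumulated into the existing contents of out
    base = out if no_zero else np.zeros_like(out)
    return base + x @ w_dequant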
// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#ifndef _q4_matmul_cuh
#define _q4_matmul_cuh
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>
#include <cstdio>
#include <ATen/cuda/CUDAContext.h>
#include "q4_matrix.cuh"
#include "../tuning.h"
void q4_matmul_cuda
(
ExLlamaTuning* tuningParams,
const half* x,
const int x_height,
const Q4Matrix* w,
half* out,
bool no_zero = false,
cudaStream_t alt_stream = NULL
);
void q4_matmul_recons_cuda
(
ExLlamaTuning* tuningParams,
const half* x,
const int x_height,
Q4Matrix* w,
half* out,
const cublasHandle_t handle,
bool no_zero = false
);
#endif
// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#include "q4_matrix.cuh"
#include <vector>
#include "../util.cuh"
#include "../matrix.cuh"
using namespace std;
const int UNSHUF_BLOCKSIZE_X = 64;
const int RECONS_THREADS_X = 64; // Block size and thread count along columns in out, each thread converts 1 column
const int RECONS_THREADS_Y = 1; // Block size and thread count along rows in x and out, each thread converts 8 rows
vector<Q4Matrix*> g_q4_matrices;
void g_q4_keep_matrix(Q4Matrix* m)
{
g_q4_matrices.push_back(m);
}
void g_q4_free_matrices()
{
for (const auto& m : g_q4_matrices) delete m;
g_q4_matrices.clear();
}
Q4Matrix::Q4Matrix
(
const int _height,
const int _width,
const int _groups,
uint32_t* _qweight,
uint32_t* _qzeros,
half* _scales,
uint32_t* _g_idx,
const int _device
) :
height(_height),
width(_width),
groups(_groups),
device(_device)
{
cudaSetDevice(device);
cuda_qweight = _qweight;
cuda_qzeros = _qzeros;
cuda_scales = _scales;
groupsize = height / groups;
if (_g_idx) make_sequential(_g_idx);
}
Q4Matrix::~Q4Matrix()
{
}
// Make sequential
__global__ void make_sequential_kernel
(
const uint32_t* __restrict__ w,
uint32_t* __restrict__ w_new,
const uint32_t* __restrict__ x_map,
const int w_height,
const int w_width
)
{
const uint64_t* w2 = (uint64_t*) w;
uint64_t* w_new2 = (uint64_t*) w_new;
int w2_stride = w_width >> 1;
int w2_column = UNSHUF_BLOCKSIZE_X * blockIdx.x + threadIdx.x;
int w_new2_row = blockIdx.y;
int x_map_idx = w_new2_row << 3;
uint64_t dst = 0;
#pragma unroll
for (int i = 0; i < 8; i++)
{
int source_row = x_map[x_map_idx++];
int w2_row = source_row >> 3;
int w2_subrow = source_row & 0x07;
int w2_row_shift = w2_subrow << 2;
int wnew2_row_shift = i << 2;
uint64_t src = w2[w2_row * w2_stride + w2_column];
src >>= w2_row_shift;
src &= 0x0000000f0000000f;
src <<= wnew2_row_shift;
dst |= src;
}
w_new2[w_new2_row * w2_stride + w2_column] = dst;
}
void Q4Matrix::make_sequential(const uint32_t* cpu_g_idx)
{
uint32_t* cuda_new_qweight = NULL;
cudaMalloc(&cuda_new_qweight, height / 8 * width * sizeof(uint32_t));
cudaMalloc(&cuda_x_map, height * sizeof(uint32_t)); // TODO: Should probably be allocated in PyTorch
uint32_t* cpu_g_idx_map = (uint32_t*) calloc(groups, sizeof(uint32_t));
uint32_t* cpu_x_map = (uint32_t*) malloc(height * sizeof(uint32_t));
uint32_t* cpu_x_map_inv = (uint32_t*) malloc(height * sizeof(uint32_t));
// Group histogram
for (int i = 0; i < height; i++) cpu_g_idx_map[cpu_g_idx[i]]++;
// Group map
for (int i = 0, acc = 0; i < groups; i++)
{
short tmp = cpu_g_idx_map[i];
cpu_g_idx_map[i] = acc;
acc += tmp;
}
// X map (inverse)
for (int row = 0; row < height; row++)
{
uint32_t target_group = cpu_g_idx[row];
uint32_t target_row = cpu_g_idx_map[target_group];
cpu_g_idx_map[target_group]++;
cpu_x_map_inv[row] = target_row;
}
// X map
for (int row = 0; row < height; row++) cpu_x_map[cpu_x_map_inv[row]] = row;
// Move to CUDA
cudaMemcpyAsync(cuda_x_map, cpu_x_map, height * sizeof(uint32_t), cudaMemcpyHostToDevice);
// Rearrange rows in w
dim3 threads(UNSHUF_BLOCKSIZE_X, 1, 1);
dim3 blocks(width / UNSHUF_BLOCKSIZE_X / 2, height / 8, 1);
make_sequential_kernel<<<blocks, threads>>>(cuda_qweight, cuda_new_qweight, cuda_x_map, height / 8, width);
// Replace qweights
cudaMemcpyAsync(cuda_qweight, cuda_new_qweight, height / 8 * width * sizeof(uint32_t), cudaMemcpyDeviceToDevice);
// Cleanup
cudaDeviceSynchronize();
cudaFree(cuda_new_qweight);
free(cpu_g_idx_map);
free(cpu_x_map);
free(cpu_x_map_inv);
}
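The host-side half of make_sequential above is a counting sort of rows by group index: a group histogram, an exclusive prefix sum into per-group cursors, then one pass assigning each row its sequential position. A NumPy sketch of how cpu_x_map and its inverse are derived (function name hypothetical):

import numpy as np

def build_x_map(g_idx, groups):
    counts = np.bincount(g_idx, minlength=groups)
    cursor = np.concatenate(([0], np.cumsum(counts[:-1])))  # first row of each group
    x_map_inv = np.empty(len(g_idx), dtype=np.uint32)
    for row, group in enumerate(g_idx):                     # row -> sequential position
        x_map_inv[row] = cursor[group]
        cursor[group] += 1
    x_map = np.empty_like(x_map_inv)
    x_map[x_map_inv] = np.arange(len(g_idx))                # sequential position -> row
    return x_map, x_map_inv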
__global__ void reconstruct_kernel
(
const uint32_t* __restrict__ w,
half* __restrict__ out, // (y)
const half* __restrict__ w_scales,
const uint32_t* __restrict__ w_zeros,
const int height,
const int width,
const int groupsize
)
{
// Start of block
int column = RECONS_THREADS_X * blockIdx.x + threadIdx.x;
int row = (RECONS_THREADS_Y * blockIdx.y + threadIdx.y) * 8;
// Views
MatrixView_q4_column w_(w, height, width);
MatrixView_half_rw out_(out, height, width);
MatrixView_half w_scales_(w_scales, height / groupsize, width);
MatrixView_q4_row w_zeros_(w_zeros, height / groupsize, width);
// Groupsize version
int group = row / groupsize;
half w_scale = w_scales_.item(group, column);
uint32_t w_zero = w_zeros_.item(group, column) + 1;
uint32_t w_read = w_.item_uint32_t(row, column);
half* out_ptr = out_.item_ptr(row, column);
#pragma unroll
for (int s = 0; s < 32; s += 4)
{
half w_item = __hmul(__int2half_rn((int)((w_read >> s) & 0x0f) - w_zero), w_scale);
*out_ptr = w_item; out_ptr += out_.width;
}
}
void Q4Matrix::reconstruct(half* out)
{
dim3 threads(RECONS_THREADS_X, RECONS_THREADS_Y, 1);
dim3 blocks
(
(width + threads.x - 1) / threads.x,
(height / 8 + threads.y - 1) / threads.y,
1
);
reconstruct_kernel<<<blocks, threads>>>(cuda_qweight, out, cuda_scales, cuda_qzeros, height / 8, width, groupsize);
}
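reconstruct_kernel unpacks eight 4-bit weights per uint32 of qweight, applying the per-group scale and the stored zero point (which is off by one, hence the "+ 1" above). A NumPy sketch of that dequantization for one packed word (function name hypothetical):

import numpy as np

def dequant_word(q_word, scale, qzero):
    # nibble s/4 of the word holds the weight for row (row_base + s/4)
    w_zero = qzero + 1                              # stored zero point is off by one
    nibbles = (q_word >> np.arange(0, 32, 4)) & 0x0F
    return (nibbles.astype(np.int32) - w_zero) * scale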
// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#ifndef _q4_matrix_cuh
#define _q4_matrix_cuh
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>
class Q4Matrix
{
public:
int device;
int height;
int width;
int groups;
int groupsize;
uint32_t* cuda_qweight = NULL;
uint32_t* cuda_qzeros = NULL;
half* cuda_scales = NULL;
uint32_t* cuda_x_map = NULL;
Q4Matrix
(
const int _height,
const int _width,
const int _groups,
uint32_t* _qweight,
uint32_t* _qzeros,
half* _scales,
uint32_t* _g_idx,
const int _device
);
~Q4Matrix();
void reconstruct(half* out);
private:
void make_sequential(const uint32_t* cpu_g_idx);
};
void g_q4_keep_matrix(Q4Matrix* m);
void g_q4_free_matrices();
#endif