Unverified Commit d5b5bc75 authored by Nicolas Patry, committed by GitHub

feat(server): Add exllama GPTQ CUDA kernel support #553 (#666)

Just trying to get the integration tests to pass.
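For context, a minimal sketch of how the new kernels get exercised end to end, mirroring the integration tests added in this PR (the model id, launcher fixture, and client API below are lifted from those tests; the wrapper function is illustrative):

async def gptq_smoke_test(launcher):
    # launch the server with GPTQ weights routed through the exllama kernels,
    # then issue a simple generate request against it
    with launcher("huggingface/llama-7b-gptq", num_shard=2, quantize="gptq") as handle:
        await handle.health(300)
        response = await handle.client.generate(
            "Test request", max_new_tokens=10, decoder_input_details=True
        )
        assert response.details.generated_tokens == 10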


# What does this PR do?

<!--
Congratulations! You've made it this far! You're not quite done yet
though.

Once merged, your PR is going to appear in the release notes with the
title you set, so make sure it's a great title that fully reflects the
extent of your awesome contribution.

Then, please replace this with a description of the change and which
issue is fixed (if applicable). Please also include relevant motivation
and context. List any dependencies (if any) that are required for this
change.

Once you're done, someone will review your PR shortly (see the section
"Who can review?" below to tag some potential reviewers). They may
suggest changes to make the code even better. If no one reviewed your PR
after a week has passed, don't hesitate to post a new comment
@-mentioning the same persons---sometimes notifications get lost.
-->

<!-- Remove if not applicable -->

Fixes # (issue)


## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Did you read the [contributor
guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
      Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the
[forum](https://discuss.huggingface.co/)? Please add a link
      to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes? Here are the
      [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and
      [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?


## Who can review?

Anyone in the community is free to review the PR once the tests have
passed. Feel free to tag
members/contributors who may be interested in your PR.

<!-- Your PR will be replied to more quickly if you can figure out the
right person to tag with @


@OlivierDehaene OR @Narsil

 -->

---------
Co-authored-by: Felix Marty <9808326+fxmarty@users.noreply.github.com>
parent bf94df3c
@@ -108,6 +108,17 @@ COPY server/Makefile-flash-att-v2 Makefile
 # Build specific version of flash attention v2
 RUN make build-flash-attention-v2
 
+# Build Transformers exllama kernels
+FROM kernel-builder as exllama-kernels-builder
+
+WORKDIR /usr/src
+
+COPY server/exllama_kernels/ .
+
+
+# Build specific version of transformers
+RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
+
 # Build Transformers CUDA kernels
 FROM kernel-builder as custom-kernels-builder
@@ -161,6 +172,8 @@ COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
 # Copy build artifacts from custom kernels builder
 COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+# Copy build artifacts from exllama kernels builder
+COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
 # Copy builds artifacts from vllm builder
 COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
@@ -56,3 +56,6 @@ run-bloom:
 run-bloom-quantize:
 	text-generation-launcher --model-id bigscience/bloom --num-shard 8 --quantize --port 8080
+
+clean:
+	rm -rf target aml
@@ -230,15 +230,16 @@ def launcher(event_loop):
             shard_uds_path,
         ]
 
+        env = os.environ
+
         if num_shard is not None:
             args.extend(["--num-shard", str(num_shard)])
-        if quantize:
+        if quantize is not None:
             args.append("--quantize")
-            args.append("bitsandbytes")
+            args.append(quantize)
         if trust_remote_code:
             args.append("--trust-remote-code")
 
-        env = os.environ
         env["LOG_LEVEL"] = "info,text_generation_router=debug"
 
         if not use_flash_attention:
@@ -275,9 +276,9 @@ def launcher(event_loop):
 
         if num_shard is not None:
             args.extend(["--num-shard", str(num_shard)])
-        if quantize:
+        if quantize is not None:
             args.append("--quantize")
-            args.append("bitsandbytes")
+            args.append(quantize)
         if trust_remote_code:
             args.append("--trust-remote-code")
 
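The effect of this fixture change, in short: quantize is now a mode string forwarded verbatim to the launcher instead of a boolean that implied bitsandbytes. A minimal sketch of the resulting argument construction (model_id and the exact argument list are illustrative):

args = ["text-generation-launcher", "--model-id", model_id]
if quantize is not None:
    # e.g. "bitsandbytes" (the previously hardcoded value) or the new "gptq"
    args.append("--quantize")
    args.append(quantize)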
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -9.59375,
"text": "Test"
},
{
"id": 2009,
"logprob": -9.6640625,
"text": "request"
}
],
"seed": null,
"tokens": [
{
"id": 29918,
"logprob": -2.3867188,
"special": false,
"text": "_"
},
{
"id": 5338,
"logprob": -2.8183594,
"special": false,
"text": "uri"
},
{
"id": 13,
"logprob": -1.6367188,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -1.0527344,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.6542969,
"special": false,
"text": " request"
},
{
"id": 29918,
"logprob": -0.056121826,
"special": false,
"text": "_"
},
{
"id": 5338,
"logprob": -0.01600647,
"special": false,
"text": "uri"
},
{
"id": 13,
"logprob": -0.87939453,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -0.7529297,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.2980957,
"special": false,
"text": " request"
}
]
},
"generated_text": "_uri\nTest request_uri\nTest request"
}
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -9.6015625,
"text": "Test"
},
{
"id": 2009,
"logprob": -9.6640625,
"text": "request"
}
],
"seed": 0,
"tokens": [
{
"id": 29899,
"logprob": -1.1640625,
"special": false,
"text": "-"
},
{
"id": 1454,
"logprob": -0.07543945,
"special": false,
"text": "for"
},
{
"id": 29899,
"logprob": 0.0,
"special": false,
"text": "-"
},
{
"id": 9342,
"logprob": 0.0,
"special": false,
"text": "comment"
},
{
"id": 29901,
"logprob": 0.0,
"special": false,
"text": ":"
},
{
"id": 396,
"logprob": -0.2956543,
"special": false,
"text": " #"
},
{
"id": 29906,
"logprob": -0.52734375,
"special": false,
"text": "2"
},
{
"id": 29900,
"logprob": -0.6899414,
"special": false,
"text": "0"
},
{
"id": 29896,
"logprob": 0.0,
"special": false,
"text": "1"
},
{
"id": 29946,
"logprob": -1.5068359,
"special": false,
"text": "4"
}
]
},
"generated_text": "Test request-for-comment: #2014"
}
[
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -9.6015625,
"text": "Test"
},
{
"id": 2009,
"logprob": -9.671875,
"text": "request"
}
],
"seed": null,
"tokens": [
{
"id": 29918,
"logprob": -2.3828125,
"special": false,
"text": "_"
},
{
"id": 5338,
"logprob": -2.8105469,
"special": false,
"text": "uri"
},
{
"id": 13,
"logprob": -1.6396484,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -1.0546875,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.6513672,
"special": false,
"text": " request"
},
{
"id": 29918,
"logprob": -0.056365967,
"special": false,
"text": "_"
},
{
"id": 5338,
"logprob": -0.016082764,
"special": false,
"text": "uri"
},
{
"id": 13,
"logprob": -0.87841797,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -0.7548828,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.29711914,
"special": false,
"text": " request"
}
]
},
"generated_text": "_uri\nTest request_uri\nTest request"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -9.6015625,
"text": "Test"
},
{
"id": 2009,
"logprob": -9.6640625,
"text": "request"
}
],
"seed": null,
"tokens": [
{
"id": 29918,
"logprob": -2.3828125,
"special": false,
"text": "_"
},
{
"id": 5338,
"logprob": -2.828125,
"special": false,
"text": "uri"
},
{
"id": 13,
"logprob": -1.6386719,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -1.0527344,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.6542969,
"special": false,
"text": " request"
},
{
"id": 29918,
"logprob": -0.055877686,
"special": false,
"text": "_"
},
{
"id": 5338,
"logprob": -0.016021729,
"special": false,
"text": "uri"
},
{
"id": 13,
"logprob": -0.8769531,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -0.7583008,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.29833984,
"special": false,
"text": " request"
}
]
},
"generated_text": "_uri\nTest request_uri\nTest request"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -9.6015625,
"text": "Test"
},
{
"id": 2009,
"logprob": -9.671875,
"text": "request"
}
],
"seed": null,
"tokens": [
{
"id": 29918,
"logprob": -2.3847656,
"special": false,
"text": "_"
},
{
"id": 5338,
"logprob": -2.8144531,
"special": false,
"text": "uri"
},
{
"id": 13,
"logprob": -1.6396484,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -1.0527344,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.65478516,
"special": false,
"text": " request"
},
{
"id": 29918,
"logprob": -0.056243896,
"special": false,
"text": "_"
},
{
"id": 5338,
"logprob": -0.016143799,
"special": false,
"text": "uri"
},
{
"id": 13,
"logprob": -0.8808594,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -0.75341797,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.2956543,
"special": false,
"text": " request"
}
]
},
"generated_text": "_uri\nTest request_uri\nTest request"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -9.6015625,
"text": "Test"
},
{
"id": 2009,
"logprob": -9.6640625,
"text": "request"
}
],
"seed": null,
"tokens": [
{
"id": 29918,
"logprob": -2.3769531,
"special": false,
"text": "_"
},
{
"id": 5338,
"logprob": -2.8183594,
"special": false,
"text": "uri"
},
{
"id": 13,
"logprob": -1.6396484,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -1.0546875,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.65478516,
"special": false,
"text": " request"
},
{
"id": 29918,
"logprob": -0.05557251,
"special": false,
"text": "_"
},
{
"id": 5338,
"logprob": -0.01612854,
"special": false,
"text": "uri"
},
{
"id": 13,
"logprob": -0.8730469,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -0.7519531,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.29785156,
"special": false,
"text": " request"
}
]
},
"generated_text": "_uri\nTest request_uri\nTest request"
}
]
{
"generated_text": "\n return sum(L) / len(L)\n\n\ndef geometric_mean(L",
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 20,
"seed": null,
"prefill": [
{
"id": 589,
"text": "def",
"logprob": null
},
{
"id": 3226,
"text": " ge",
"logprob": -9.0234375
},
{
"id": 21017,
"text": "ometric",
"logprob": -9.0859375
},
{
"id": 81,
"text": "_",
"logprob": -0.25878906
},
{
"id": 6009,
"text": "mean",
"logprob": -2.2109375
},
{
"id": 26,
"text": "(",
"logprob": -0.30371094
},
{
"id": 62,
"text": "L",
"logprob": -5.6054688
},
{
"id": 44,
"text": ":",
"logprob": -3.0722656
},
{
"id": 1682,
"text": " List",
"logprob": -0.6879883
},
{
"id": 77,
"text": "[",
"logprob": -0.38500977
},
{
"id": 1808,
"text": "float",
"logprob": -0.984375
},
{
"id": 10794,
"text": "]):",
"logprob": -2.5351562
}
],
"tokens": [
{
"id": 284,
"text": "\n ",
"logprob": -1.1738281,
"special": false
},
{
"id": 442,
"text": " return",
"logprob": -0.95947266,
"special": false
},
{
"id": 3632,
"text": " sum",
"logprob": -1.4199219,
"special": false
},
{
"id": 26,
"text": "(",
"logprob": -0.085876465,
"special": false
},
{
"id": 62,
"text": "L",
"logprob": -0.09875488,
"special": false
},
{
"id": 27,
"text": ")",
"logprob": -0.30517578,
"special": false
},
{
"id": 517,
"text": " /",
"logprob": -0.42089844,
"special": false
},
{
"id": 2069,
"text": " len",
"logprob": -0.042053223,
"special": false
},
{
"id": 26,
"text": "(",
"logprob": -0.0011806488,
"special": false
},
{
"id": 62,
"text": "L",
"logprob": -0.0005259514,
"special": false
},
{
"id": 27,
"text": ")",
"logprob": -0.0017633438,
"special": false
},
{
"id": 478,
"text": "\n\n",
"logprob": -0.69189453,
"special": false
},
{
"id": 203,
"text": "\n",
"logprob": -0.041870117,
"special": false
},
{
"id": 589,
"text": "def",
"logprob": -0.27856445,
"special": false
},
{
"id": 3226,
"text": " ge",
"logprob": -1.7255859,
"special": false
},
{
"id": 21017,
"text": "ometric",
"logprob": -0.011291504,
"special": false
},
{
"id": 81,
"text": "_",
"logprob": -0.008430481,
"special": false
},
{
"id": 6009,
"text": "mean",
"logprob": -0.025787354,
"special": false
},
{
"id": 26,
"text": "(",
"logprob": -0.073913574,
"special": false
},
{
"id": 62,
"text": "L",
"logprob": -0.09967041,
"special": false
}
]
}
}
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 20,
"prefill": [
{
"id": 589,
"logprob": null,
"text": "def"
},
{
"id": 3226,
"logprob": -9.0234375,
"text": " ge"
},
{
"id": 21017,
"logprob": -9.09375,
"text": "ometric"
},
{
"id": 81,
"logprob": -0.25976562,
"text": "_"
},
{
"id": 6009,
"logprob": -2.2148438,
"text": "mean"
},
{
"id": 26,
"logprob": -0.3010254,
"text": "("
},
{
"id": 62,
"logprob": -5.6757812,
"text": "L"
},
{
"id": 44,
"logprob": -3.0898438,
"text": ":"
},
{
"id": 1682,
"logprob": -0.6791992,
"text": " List"
},
{
"id": 77,
"logprob": -0.38891602,
"text": "["
},
{
"id": 1808,
"logprob": -0.92041016,
"text": "float"
},
{
"id": 10794,
"logprob": -2.5390625,
"text": "]):"
}
],
"seed": 0,
"tokens": [
{
"id": 284,
"logprob": 0.0,
"special": false,
"text": "\n "
},
{
"id": 442,
"logprob": 0.0,
"special": false,
"text": " return"
},
{
"id": 11665,
"logprob": -1.6005859,
"special": false,
"text": " reduce"
},
{
"id": 26,
"logprob": 0.0,
"special": false,
"text": "("
},
{
"id": 5962,
"logprob": 0.0,
"special": false,
"text": "lambda"
},
{
"id": 816,
"logprob": 0.0,
"special": false,
"text": " x"
},
{
"id": 30,
"logprob": 0.0,
"special": false,
"text": ","
},
{
"id": 533,
"logprob": 0.0,
"special": false,
"text": " y"
},
{
"id": 44,
"logprob": 0.0,
"special": false,
"text": ":"
},
{
"id": 816,
"logprob": 0.0,
"special": false,
"text": " x"
},
{
"id": 319,
"logprob": 0.0,
"special": false,
"text": " *"
},
{
"id": 533,
"logprob": 0.0,
"special": false,
"text": " y"
},
{
"id": 30,
"logprob": 0.0,
"special": false,
"text": ","
},
{
"id": 498,
"logprob": 0.0,
"special": false,
"text": " L"
},
{
"id": 27,
"logprob": 0.0,
"special": false,
"text": ")"
},
{
"id": 203,
"logprob": -0.11968994,
"special": false,
"text": "\n"
},
{
"id": 203,
"logprob": 0.0,
"special": false,
"text": "\n"
},
{
"id": 589,
"logprob": 0.0,
"special": false,
"text": "def"
},
{
"id": 3226,
"logprob": 0.0,
"special": false,
"text": " ge"
},
{
"id": 21017,
"logprob": 0.0,
"special": false,
"text": "ometric"
}
]
},
"generated_text": "\n return reduce(lambda x, y: x * y, L)\n\ndef geometric"
}
[
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 589,
"logprob": null,
"text": "def"
},
{
"id": 3226,
"logprob": -9.0234375,
"text": " ge"
},
{
"id": 21017,
"logprob": -9.0859375,
"text": "ometric"
},
{
"id": 81,
"logprob": -0.25927734,
"text": "_"
},
{
"id": 6009,
"logprob": -2.25,
"text": "mean"
},
{
"id": 26,
"logprob": -0.30126953,
"text": "("
},
{
"id": 62,
"logprob": -5.7539062,
"text": "L"
},
{
"id": 44,
"logprob": -3.0878906,
"text": ":"
},
{
"id": 1682,
"logprob": -0.6845703,
"text": " List"
},
{
"id": 77,
"logprob": -0.3918457,
"text": "["
},
{
"id": 1808,
"logprob": -0.8798828,
"text": "float"
},
{
"id": 10794,
"logprob": -2.4980469,
"text": "]):"
}
],
"seed": null,
"tokens": [
{
"id": 284,
"logprob": -1.1533203,
"special": false,
"text": "\n "
},
{
"id": 442,
"logprob": -0.91796875,
"special": false,
"text": " return"
},
{
"id": 3632,
"logprob": -1.3291016,
"special": false,
"text": " sum"
},
{
"id": 26,
"logprob": -0.08062744,
"special": false,
"text": "("
},
{
"id": 62,
"logprob": -0.097717285,
"special": false,
"text": "L"
},
{
"id": 27,
"logprob": -0.29003906,
"special": false,
"text": ")"
},
{
"id": 517,
"logprob": -0.34958984,
"special": false,
"text": " /"
},
{
"id": 2069,
"logprob": -0.03829956,
"special": false,
"text": " len"
},
{
"id": 26,
"logprob": -0.0011987686,
"special": false,
"text": "("
},
{
"id": 62,
"logprob": -0.00050878525,
"special": false,
"text": "L"
}
]
},
"generated_text": "\n return sum(L) / len(L"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 589,
"logprob": null,
"text": "def"
},
{
"id": 3226,
"logprob": -9.0234375,
"text": " ge"
},
{
"id": 21017,
"logprob": -9.0859375,
"text": "ometric"
},
{
"id": 81,
"logprob": -0.25878906,
"text": "_"
},
{
"id": 6009,
"logprob": -2.2109375,
"text": "mean"
},
{
"id": 26,
"logprob": -0.30371094,
"text": "("
},
{
"id": 62,
"logprob": -5.6054688,
"text": "L"
},
{
"id": 44,
"logprob": -3.0722656,
"text": ":"
},
{
"id": 1682,
"logprob": -0.6879883,
"text": " List"
},
{
"id": 77,
"logprob": -0.38500977,
"text": "["
},
{
"id": 1808,
"logprob": -0.984375,
"text": "float"
},
{
"id": 10794,
"logprob": -2.5351562,
"text": "]):"
}
],
"seed": null,
"tokens": [
{
"id": 284,
"logprob": -1.1738281,
"special": false,
"text": "\n "
},
{
"id": 442,
"logprob": -0.9584961,
"special": false,
"text": " return"
},
{
"id": 3632,
"logprob": -1.4169922,
"special": false,
"text": " sum"
},
{
"id": 26,
"logprob": -0.085876465,
"special": false,
"text": "("
},
{
"id": 62,
"logprob": -0.0982666,
"special": false,
"text": "L"
},
{
"id": 27,
"logprob": -0.3022461,
"special": false,
"text": ")"
},
{
"id": 517,
"logprob": -0.40504883,
"special": false,
"text": " /"
},
{
"id": 2069,
"logprob": -0.041656494,
"special": false,
"text": " len"
},
{
"id": 26,
"logprob": -0.0011844635,
"special": false,
"text": "("
},
{
"id": 62,
"logprob": -0.0005264282,
"special": false,
"text": "L"
}
]
},
"generated_text": "\n return sum(L) / len(L"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 589,
"logprob": null,
"text": "def"
},
{
"id": 3226,
"logprob": -9.0234375,
"text": " ge"
},
{
"id": 21017,
"logprob": -9.0859375,
"text": "ometric"
},
{
"id": 81,
"logprob": -0.25927734,
"text": "_"
},
{
"id": 6009,
"logprob": -2.25,
"text": "mean"
},
{
"id": 26,
"logprob": -0.30126953,
"text": "("
},
{
"id": 62,
"logprob": -5.7539062,
"text": "L"
},
{
"id": 44,
"logprob": -3.0878906,
"text": ":"
},
{
"id": 1682,
"logprob": -0.6845703,
"text": " List"
},
{
"id": 77,
"logprob": -0.3918457,
"text": "["
},
{
"id": 1808,
"logprob": -0.8798828,
"text": "float"
},
{
"id": 10794,
"logprob": -2.4980469,
"text": "]):"
}
],
"seed": null,
"tokens": [
{
"id": 284,
"logprob": -1.1533203,
"special": false,
"text": "\n "
},
{
"id": 442,
"logprob": -0.9165039,
"special": false,
"text": " return"
},
{
"id": 3632,
"logprob": -1.328125,
"special": false,
"text": " sum"
},
{
"id": 26,
"logprob": -0.07946777,
"special": false,
"text": "("
},
{
"id": 62,
"logprob": -0.09820557,
"special": false,
"text": "L"
},
{
"id": 27,
"logprob": -0.28930664,
"special": false,
"text": ")"
},
{
"id": 517,
"logprob": -0.34592773,
"special": false,
"text": " /"
},
{
"id": 2069,
"logprob": -0.038330078,
"special": false,
"text": " len"
},
{
"id": 26,
"logprob": -0.0011940002,
"special": false,
"text": "("
},
{
"id": 62,
"logprob": -0.00050878525,
"special": false,
"text": "L"
}
]
},
"generated_text": "\n return sum(L) / len(L"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 589,
"logprob": null,
"text": "def"
},
{
"id": 3226,
"logprob": -9.0234375,
"text": " ge"
},
{
"id": 21017,
"logprob": -9.0859375,
"text": "ometric"
},
{
"id": 81,
"logprob": -0.25927734,
"text": "_"
},
{
"id": 6009,
"logprob": -2.25,
"text": "mean"
},
{
"id": 26,
"logprob": -0.30126953,
"text": "("
},
{
"id": 62,
"logprob": -5.7539062,
"text": "L"
},
{
"id": 44,
"logprob": -3.0878906,
"text": ":"
},
{
"id": 1682,
"logprob": -0.6845703,
"text": " List"
},
{
"id": 77,
"logprob": -0.3918457,
"text": "["
},
{
"id": 1808,
"logprob": -0.8798828,
"text": "float"
},
{
"id": 10794,
"logprob": -2.4980469,
"text": "]):"
}
],
"seed": null,
"tokens": [
{
"id": 284,
"logprob": -1.1533203,
"special": false,
"text": "\n "
},
{
"id": 442,
"logprob": -0.91259766,
"special": false,
"text": " return"
},
{
"id": 3632,
"logprob": -1.3251953,
"special": false,
"text": " sum"
},
{
"id": 26,
"logprob": -0.08062744,
"special": false,
"text": "("
},
{
"id": 62,
"logprob": -0.09906006,
"special": false,
"text": "L"
},
{
"id": 27,
"logprob": -0.28979492,
"special": false,
"text": ")"
},
{
"id": 517,
"logprob": -0.35958984,
"special": false,
"text": " /"
},
{
"id": 2069,
"logprob": -0.038604736,
"special": false,
"text": " len"
},
{
"id": 26,
"logprob": -0.0011901855,
"special": false,
"text": "("
},
{
"id": 62,
"logprob": -0.0005078316,
"special": false,
"text": "L"
}
]
},
"generated_text": "\n return sum(L) / len(L"
}
]
import pytest


@pytest.fixture(scope="module")
def flash_llama_gptq_handle(launcher):
    with launcher("huggingface/llama-7b-gptq", num_shard=2, quantize="gptq") as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_llama_gptq(flash_llama_gptq_handle):
    await flash_llama_gptq_handle.health(300)
    return flash_llama_gptq_handle.client


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_gptq(flash_llama_gptq, response_snapshot):
    response = await flash_llama_gptq.generate(
        "Test request", max_new_tokens=10, decoder_input_details=True
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_gptq_all_params(flash_llama_gptq, response_snapshot):
    response = await flash_llama_gptq.generate(
        "Test request",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_gptq_load(flash_llama_gptq, generate_load, response_snapshot):
    responses = await generate_load(flash_llama_gptq, "Test request", max_new_tokens=10, n=4)

    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])
    assert responses == response_snapshot
import pytest


@pytest.fixture(scope="module")
def flash_starcoder_gptq_handle(launcher):
    with launcher("Narsil/starcoder-gptq", num_shard=2, quantize="gptq") as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_starcoder_gptq(flash_starcoder_gptq_handle):
    await flash_starcoder_gptq_handle.health(300)
    return flash_starcoder_gptq_handle.client


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_starcoder_gptq(flash_starcoder_gptq, response_snapshot):
    response = await flash_starcoder_gptq.generate(
        "def geometric_mean(L: List[float]):",
        max_new_tokens=20,
        decoder_input_details=True,
    )
    assert response.details.generated_tokens == 20
    assert response == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_starcoder_gptq_default_params(flash_starcoder_gptq, response_snapshot):
    response = await flash_starcoder_gptq.generate(
        "def geometric_mean(L: List[float]):",
        max_new_tokens=20,
        temperature=0.2,
        top_p=0.95,
        decoder_input_details=True,
        seed=0,
    )
    assert response.details.generated_tokens == 20
    assert response == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_starcoder_gptq_load(flash_starcoder_gptq, generate_load, response_snapshot):
    responses = await generate_load(
        flash_starcoder_gptq,
        "def geometric_mean(L: List[float]):",
        max_new_tokens=10,
        n=4,
    )

    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])
    assert responses == response_snapshot
// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#define _cuda_buffers_cu
#include "cuda_buffers.cuh"
CudaBuffers* g_buffers[CUDA_MAX_DEVICES] = {NULL};
// __constant__ half2 q4_table[16][256];
// half2 q4_table_host[16][256];
// bool q4_table_init = false;
CudaBuffers::CudaBuffers
(
int _device,
half* _temp_state,
half* _temp_dq
) :
device(_device),
temp_state(_temp_state),
temp_dq(_temp_dq)
{
cudaSetDevice(_device);
cudaStreamCreate(&alt_stream_1);
cudaStreamCreate(&alt_stream_2);
cudaStreamCreate(&alt_stream_3);
cudaEventCreate(&alt_stream_1_done);
cudaEventCreate(&alt_stream_2_done);
cudaEventCreate(&alt_stream_3_done);
}
CudaBuffers::~CudaBuffers()
{
cudaStreamDestroy(alt_stream_1);
cudaStreamDestroy(alt_stream_2);
cudaStreamDestroy(alt_stream_3);
cudaEventDestroy(alt_stream_1_done);
cudaEventDestroy(alt_stream_2_done);
cudaEventDestroy(alt_stream_3_done);
}
CudaBuffers* get_buffers(const int device_index)
{
return g_buffers[device_index];
}
void prepare_buffers_cuda
(
int _device,
half* _temp_state,
half* _temp_dq
)
{
CudaBuffers* buffers = new CudaBuffers
(
_device,
_temp_state,
_temp_dq
);
g_buffers[_device] = buffers;
}
void cleanup_buffers_cuda()
{
for (int i = 0; i < CUDA_MAX_DEVICES; i++)
{
if (!g_buffers[i]) continue;
delete g_buffers[i];
g_buffers[i] = NULL;
}
}
// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#ifndef _cuda_buffers_cuh
#define _cuda_buffers_cuh
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>
#include <cstdio>
const int CUDA_MAX_DEVICES = 16;
// #ifndef _cuda_buffers_cu
// extern __constant__ half2 q4_table[16][256];
// #endif
class CudaBuffers
{
public:
int device;
half* temp_state; // [max_hidden_rows * intermediate_size]
half* temp_dq; // size of largest quant tensor * 8
cudaStream_t alt_stream_1;
cudaStream_t alt_stream_2;
cudaStream_t alt_stream_3;
cudaEvent_t alt_stream_1_done;
cudaEvent_t alt_stream_2_done;
cudaEvent_t alt_stream_3_done;
CudaBuffers
(
int _device,
half* _temp_state,
half* _temp_dq
);
~CudaBuffers();
};
CudaBuffers* get_buffers(const int device_index);
void prepare_buffers_cuda
(
int _device,
half* _temp_state,
half* _temp_dq
);
void cleanup_buffers_cuda();
#endif
// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#ifndef _cuda_compat_cuh
#define _cuda_compat_cuh
// atomicAdd for half types, to support CC < 7.x
__device__ __forceinline__ void atomicAdd_half(half* address, half val)
{
unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2));
unsigned int old = *address_as_ui;
unsigned int assumed;
do
{
assumed = old;
__half_raw hsum;
hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
half tmpres = __hadd(hsum, val);
hsum = __half_raw(tmpres);
old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
old = atomicCAS(address_as_ui, assumed, old);
}
while (assumed != old);
}
// atomicAdd for half2 types
__device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val)
{
unsigned int* address_as_ui = (unsigned int*)address;
unsigned int old = *address_as_ui;
unsigned int assumed;
do
{
assumed = old;
half2 old_val = *((half2*)&old);
half2 new_val = __hadd2(old_val, val);
old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val));
}
while (assumed != old);
}
//
#if defined(__CUDA_ARCH__)
#if __CUDA_ARCH__ < 700
__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }
#if __CUDA_ARCH__ < 600
__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
#endif
#endif
#endif
#endif
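The two helpers above emulate half-precision atomicAdd on architectures that lack it by retrying a 32-bit compare-and-swap until no other thread raced the update. A rough Python rendering of that retry loop (compare_and_swap is a hypothetical stand-in for CUDA's atomicCAS, which returns the word's previous value):

def atomic_add_emulated(mem, addr, val):
    # keep retrying until the word was not modified by another thread
    # between our read and our swap, mirroring the do/while(assumed != old)
    while True:
        assumed = mem[addr]
        old = compare_and_swap(mem, addr, assumed, assumed + val)
        if old == assumed:
            return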
// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#include "column_remap.cuh"
#include "../util.cuh"
const int SHUF_BLOCKSIZE_X = 256;
const int SHUF_BLOCKSIZE_Y = 16;
__global__ void column_remap_kernel
(
const half* __restrict__ x,
half* __restrict__ x_new,
const int x_width,
const int x_height,
const uint32_t* x_map
)
{
int x_column = SHUF_BLOCKSIZE_X * blockIdx.x + threadIdx.x;
int x_row = SHUF_BLOCKSIZE_Y * blockIdx.y;
int x_stride = x_width;
int x_idx = x_row * x_stride + x_column;
int x_row_end = min(x_row + SHUF_BLOCKSIZE_Y, x_height);
int x_idx_end = x_row_end * x_stride + x_column;
int s_column = x_map[x_column];
int s_idx = x_row * x_stride + s_column;
while (x_idx < x_idx_end)
{
x_new[x_idx] = x[s_idx];
x_idx += x_stride;
s_idx += x_stride;
}
}
// Remap columns in x to correspond to sequential group index before matmul
//
// perform x -> seq_x such that seq_x @ seq_w == x @ w
void column_remap_cuda
(
const half* x,
half* x_new,
const int x_height,
const int x_width,
const uint32_t* x_map
)
{
dim3 threads(SHUF_BLOCKSIZE_X, 1, 1);
dim3 blocks
(
(x_width + SHUF_BLOCKSIZE_X - 1) / SHUF_BLOCKSIZE_X,
(x_height + SHUF_BLOCKSIZE_Y - 1) / SHUF_BLOCKSIZE_Y,
1
);
column_remap_kernel<<<blocks, threads>>>(x, x_new, x_width, x_height, x_map);
}
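A NumPy sketch of what this kernel computes, and of the invariant the comment above states (w_seq is a hypothetical name for the row-permuted, "sequential" weight matrix):

import numpy as np

def column_remap(x, x_map):
    # mirrors column_remap_kernel: x_new[:, j] = x[:, x_map[j]]
    return x[:, x_map]

# With w_seq[j, :] == w[x_map[j], :], permuting x's columns and w's rows by
# the same map leaves the product unchanged:
#   column_remap(x, x_map) @ w_seq == x @ w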
// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#ifndef _column_remap_cuh
#define _column_remap_cuh
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>
void column_remap_cuda
(
const half* x,
half* x_new,
const int x_height,
const int x_width,
const uint32_t* x_map
);
#endif
#include "q4_matmul.cuh"
#include "column_remap.cuh"
#include "../util.cuh"
#include "../matrix.cuh"
#include "../cuda_compat.cuh"
#include "../cuda_buffers.cuh"
const int THREADS_X = 32; // Block size and thread count along columns in w and out
const int THREADS_Y = 1; // Block size and thread count along rows in x and out
typedef void (*fp_q4_matmul_kernel)
(
const half*,
const uint32_t*,
half*,
const half*,
const uint32_t*,
const int,
const int,
const int,
const int,
const int,
const uint32_t*,
bool
);
template<bool use_half2, bool use_groupsize, bool use_x_map>
__global__ void q4_matmul_kernel
(
const half* __restrict__ x,
const uint32_t* __restrict__ w,
half* __restrict__ out,
const half* __restrict__ w_scales,
const uint32_t* __restrict__ w_zeros,
const int height,
const int dim,
const int width,
const int groupsize,
const int block_size_z,
const uint32_t* __restrict__ x_map,
bool no_zero
)
{
// Start of block
int x_column = block_size_z * blockIdx.z;
int x_column_end = min(dim, block_size_z * (blockIdx.z + 1));
int w_column = THREADS_X * blockIdx.x + threadIdx.x;
int x_row = THREADS_Y * blockIdx.y + threadIdx.y;
int iterations = (x_column_end - x_column) / 8;
// Views
MatrixView_half x_(x, height, dim);
MatrixView_half w_scales_(w_scales, dim / groupsize, width);
MatrixView_q4_row w_zeros_(w_zeros, dim / groupsize, width);
MatrixView_q4_column w_(w, dim, width);
MatrixView_half_rw out_(out, height, width);
// Zero output
if (!no_zero && blockIdx.z == 0 && (threadIdx.x & 1) == 0)
{
*((uint32_t*) out_.item_ptr(x_row, w_column)) = 0;
__syncthreads();
}
// Loop over part of x row (and w column)
half2 acc = {};
half acc_h = {};
if constexpr (use_groupsize)
{
// For quant matrices where groupsize divides BLOCK_SIZE_Z we always start on a group boundary, so this
// could be slightly faster
for (int k = x_column, group = x_column / groupsize; k < x_column + iterations * 8; group++, k += groupsize)
{
if constexpr (use_half2)
{
half2 w_scale = w_scales_.item_half2half2(group, w_column);
uint32_t w_zero = w_zeros_.item(group, w_column) + 1;
if constexpr (use_x_map) acc = dot_product_8_x_map(acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8, x_map);
else acc = dot_product_8 (acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8);
}
else
{
half w_scale = w_scales_.item(group, w_column);
uint32_t w_zero = w_zeros_.item(group, w_column) + 1;
if constexpr (use_x_map) acc_h = dot_product_8_x_map_h(acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8, x_map);
else acc_h = dot_product_8_h (acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8);
}
}
}
else
{
// Otherwise assume groupsize is a multiple of 8, do 8 columns per iteration and trust the cache
for (int k = x_column; k < x_column + iterations * 8; k += 8)
{
if constexpr (use_half2)
{
int group = k / groupsize;
half2 w_scale = w_scales_.item_half2half2(group, w_column);
uint32_t w_zero = w_zeros_.item(group, w_column) + 1;
if constexpr (use_x_map) acc = dot_product_8_x_map(acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1, x_map);
else acc = dot_product_8 (acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1);
}
else
{
int group = k / groupsize;
half w_scale = w_scales_.item(group, w_column);
uint32_t w_zero = w_zeros_.item(group, w_column) + 1;
if constexpr (use_x_map) acc_h = dot_product_8_x_map_h(acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1, x_map);
else acc_h = dot_product_8_h (acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1);
}
}
}
// Add to block result
if constexpr (use_half2)
{
half result = __hadd(acc.x, acc.y);
atomicAdd(out_.item_ptr(x_row, w_column), result);
}
else
{
atomicAdd(out_.item_ptr(x_row, w_column), acc_h);
}
}
fp_q4_matmul_kernel q4_matmul_kernel_pick(ExLlamaTuning* tuningParams, int block_size_z, int groupsize, uint32_t* x_map)
{
// <bool use_half2, bool use_groupsize, bool use_x_map>
if (tuningParams->matmul_no_half2) {
if (block_size_z % groupsize == 0) {
if (x_map) return q4_matmul_kernel<false, true, true >;
else return q4_matmul_kernel<false, true, false>;
} else {
if (x_map) return q4_matmul_kernel<false, false, true >;
else return q4_matmul_kernel<false, false, false>;
}
} else {
if (block_size_z % groupsize == 0)
{
if (x_map) return q4_matmul_kernel<true, true, true >;
else return q4_matmul_kernel<true, true, false>;
} else {
if (x_map) return q4_matmul_kernel<true, false, true >;
else return q4_matmul_kernel<true, false, false>;
}
}
};
// Compute y = x @ w
void q4_matmul_cuda
(
ExLlamaTuning* tuningParams,
const half* x,
const int x_height,
const Q4Matrix* w,
half* out,
bool no_zero,
cudaStream_t alt_stream
)
{
int height = x_height;
int dim = w->height;
int width = w->width;
cudaSetDevice(w->device);
uint32_t* x_map = w->cuda_x_map;
const half* x_mapped = x;
if (x_map && !tuningParams->matmul_fused_remap && !alt_stream)
{
CudaBuffers* buffers = get_buffers(w->device);
column_remap_cuda(x, buffers->temp_state, x_height, dim, w->cuda_x_map);
x_mapped = buffers->temp_state;
x_map = NULL;
}
int block_size_z;
if (w->width == 4096) block_size_z = 384; // 7B
else if (w->width == 11008) block_size_z = 256;
else if (w->width == 5120) block_size_z = 384; // 13B
else if (w->width == 13824) block_size_z = 256;
else if (w->width == 6656) block_size_z = 256; // 33B
else if (w->width == 17920) block_size_z = 128;
else block_size_z = 256;
//if (!no_zero) cudaMemsetAsync(out, 0, x_height * w->width * sizeof(half));
dim3 threads(THREADS_X, THREADS_Y, 1);
dim3 blocks
(
(width + threads.x - 1) / threads.x,
(height + threads.y - 1) / threads.y,
(dim + block_size_z - 1) / block_size_z
);
fp_q4_matmul_kernel kernel = q4_matmul_kernel_pick(tuningParams, block_size_z, w->groupsize, x_map);
kernel<<<blocks, threads, 0, alt_stream>>> (x_mapped, w->cuda_qweight, out, w->cuda_scales, w->cuda_qzeros, height, dim, width, w->groupsize, block_size_z, x_map, no_zero);
}
void q4_matmul_recons_cuda
(
ExLlamaTuning* tuningParams,
const half* x,
const int x_height,
Q4Matrix* w,
half* out,
const cublasHandle_t handle,
bool no_zero
)
{
int height = x_height;
int dim = w->height;
int width = w->width;
cudaSetDevice(w->device);
CudaBuffers* buffers = get_buffers(w->device);
const half* x_mapped = x;
if (w->cuda_x_map)
{
column_remap_cuda(x, buffers->temp_state, x_height, dim, w->cuda_x_map);
x_mapped = buffers->temp_state;
}
w->reconstruct(buffers->temp_dq);
const half alpha = __float2half(1.0f);
const half beta = no_zero ? __float2half(1.0f) : __float2half(0.0f);
cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, width, x_mapped, dim, &beta, out, width);
// const float alpha = 1.0f;
// const float beta = no_zero ? 1.0f : 0.0f;
// cublasSgemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, CUDA_R_16F, width,
// x_mapped, CUDA_R_16F, dim, &beta, out, CUDA_R_16F, width);
}
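Both entry points share the same accumulation semantics: with no_zero the product is added into out, otherwise out is overwritten (the fused kernel zeroes out in its first z-block; the recons path sets the cublasHgemm beta to 1.0 or 0.0). A hypothetical NumPy reference for that contract:

import numpy as np

def q4_matmul_ref(x, w_dequant, out, no_zero=False):
    # out = x @ w, optionally accumulated into the existing contents of out
    base = out if no_zero else np.zeros_like(out)
    return base + x @ w_dequant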
// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#ifndef _q4_matmul_cuh
#define _q4_matmul_cuh
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>
#include <cstdio>
#include <ATen/cuda/CUDAContext.h>
#include "q4_matrix.cuh"
#include "../tuning.h"
void q4_matmul_cuda
(
ExLlamaTuning* tuningParams,
const half* x,
const int x_height,
const Q4Matrix* w,
half* out,
bool no_zero = false,
cudaStream_t alt_stream = NULL
);
void q4_matmul_recons_cuda
(
ExLlamaTuning* tuningParams,
const half* x,
const int x_height,
Q4Matrix* w,
half* out,
const cublasHandle_t handle,
bool no_zero = false
);
#endif
// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#include "q4_matrix.cuh"
#include <vector>
#include "../util.cuh"
#include "../matrix.cuh"
using namespace std;
const int UNSHUF_BLOCKSIZE_X = 64;
const int RECONS_THREADS_X = 64; // Block size and thread count along columns in out, each thread converts 1 column
const int RECONS_THREADS_Y = 1; // Block size and thread count along rows in x and out, each thread converts 8 rows
vector<Q4Matrix*> g_q4_matrices;
void g_q4_keep_matrix(Q4Matrix* m)
{
g_q4_matrices.push_back(m);
}
void g_q4_free_matrices()
{
for (const auto& m : g_q4_matrices) delete m;
g_q4_matrices.clear();
}
Q4Matrix::Q4Matrix
(
const int _height,
const int _width,
const int _groups,
uint32_t* _qweight,
uint32_t* _qzeros,
half* _scales,
uint32_t* _g_idx,
const int _device
) :
height(_height),
width(_width),
groups(_groups),
device(_device)
{
cudaSetDevice(device);
cuda_qweight = _qweight;
cuda_qzeros = _qzeros;
cuda_scales = _scales;
groupsize = height / groups;
if (_g_idx) make_sequential(_g_idx);
}
Q4Matrix::~Q4Matrix()
{
}
// Make sequential
__global__ void make_sequential_kernel
(
const uint32_t* __restrict__ w,
uint32_t* __restrict__ w_new,
const uint32_t* __restrict__ x_map,
const int w_height,
const int w_width
)
{
const uint64_t* w2 = (uint64_t*) w;
uint64_t* w_new2 = (uint64_t*) w_new;
int w2_stride = w_width >> 1;
int w2_column = UNSHUF_BLOCKSIZE_X * blockIdx.x + threadIdx.x;
int w_new2_row = blockIdx.y;
int x_map_idx = w_new2_row << 3;
uint64_t dst = 0;
#pragma unroll
for (int i = 0; i < 8; i++)
{
int source_row = x_map[x_map_idx++];
int w2_row = source_row >> 3;
int w2_subrow = source_row & 0x07;
int w2_row_shift = w2_subrow << 2;
int wnew2_row_shift = i << 2;
uint64_t src = w2[w2_row * w2_stride + w2_column];
src >>= w2_row_shift;
src &= 0x0000000f0000000f;
src <<= wnew2_row_shift;
dst |= src;
}
w_new2[w_new2_row * w2_stride + w2_column] = dst;
}
void Q4Matrix::make_sequential(const uint32_t* cpu_g_idx)
{
uint32_t* cuda_new_qweight = NULL;
cudaMalloc(&cuda_new_qweight, height / 8 * width * sizeof(uint32_t));
cudaMalloc(&cuda_x_map, height * sizeof(uint32_t)); // TODO: Should probably be allocated in PyTorch
uint32_t* cpu_g_idx_map = (uint32_t*) calloc(groups, sizeof(uint32_t));
uint32_t* cpu_x_map = (uint32_t*) malloc(height * sizeof(uint32_t));
uint32_t* cpu_x_map_inv = (uint32_t*) malloc(height * sizeof(uint32_t));
// Group histogram
for (int i = 0; i < height; i++) cpu_g_idx_map[cpu_g_idx[i]]++;
// Group map
for (int i = 0, acc = 0; i < groups; i++)
{
short tmp = cpu_g_idx_map[i];
cpu_g_idx_map[i] = acc;
acc += tmp;
}
// X map (inverse)
for (int row = 0; row < height; row++)
{
uint32_t target_group = cpu_g_idx[row];
uint32_t target_row = cpu_g_idx_map[target_group];
cpu_g_idx_map[target_group]++;
cpu_x_map_inv[row] = target_row;
}
// X map
for (int row = 0; row < height; row++) cpu_x_map[cpu_x_map_inv[row]] = row;
// Move to CUDA
cudaMemcpyAsync(cuda_x_map, cpu_x_map, height * sizeof(uint32_t), cudaMemcpyHostToDevice);
// Rearrange rows in w
dim3 threads(UNSHUF_BLOCKSIZE_X, 1, 1);
dim3 blocks(width / UNSHUF_BLOCKSIZE_X / 2, height / 8, 1);
make_sequential_kernel<<<blocks, threads>>>(cuda_qweight, cuda_new_qweight, cuda_x_map, height / 8, width);
// Replace qweights
cudaMemcpyAsync(cuda_qweight, cuda_new_qweight, height / 8 * width * sizeof(uint32_t), cudaMemcpyDeviceToDevice);
// Cleanup
cudaDeviceSynchronize();
cudaFree(cuda_new_qweight);
free(cpu_g_idx_map);
free(cpu_x_map);
free(cpu_x_map_inv);
}
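The host-side half of make_sequential above is a counting sort of rows by group index: a group histogram, an exclusive prefix sum into per-group cursors, then one pass assigning each row its sequential position. A NumPy sketch of how cpu_x_map and its inverse are derived (function name hypothetical):

import numpy as np

def build_x_map(g_idx, groups):
    counts = np.bincount(g_idx, minlength=groups)
    cursor = np.concatenate(([0], np.cumsum(counts[:-1])))  # first row of each group
    x_map_inv = np.empty(len(g_idx), dtype=np.uint32)
    for row, group in enumerate(g_idx):                     # row -> sequential position
        x_map_inv[row] = cursor[group]
        cursor[group] += 1
    x_map = np.empty_like(x_map_inv)
    x_map[x_map_inv] = np.arange(len(g_idx))                # sequential position -> row
    return x_map, x_map_inv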
__global__ void reconstruct_kernel
(
const uint32_t* __restrict__ w,
half* __restrict__ out, // (y)
const half* __restrict__ w_scales,
const uint32_t* __restrict__ w_zeros,
const int height,
const int width,
const int groupsize
)
{
// Start of block
int column = RECONS_THREADS_X * blockIdx.x + threadIdx.x;
int row = (RECONS_THREADS_Y * blockIdx.y + threadIdx.y) * 8;
// Views
MatrixView_q4_column w_(w, height, width);
MatrixView_half_rw out_(out, height, width);
MatrixView_half w_scales_(w_scales, height / groupsize, width);
MatrixView_q4_row w_zeros_(w_zeros, height / groupsize, width);
// Groupsize version
int group = row / groupsize;
half w_scale = w_scales_.item(group, column);
uint32_t w_zero = w_zeros_.item(group, column) + 1;
uint32_t w_read = w_.item_uint32_t(row, column);
half* out_ptr = out_.item_ptr(row, column);
#pragma unroll
for (int s = 0; s < 32; s += 4)
{
half w_item = __hmul(__int2half_rn((int)((w_read >> s) & 0x0f) - w_zero), w_scale);
*out_ptr = w_item; out_ptr += out_.width;
}
}
void Q4Matrix::reconstruct(half* out)
{
dim3 threads(RECONS_THREADS_X, RECONS_THREADS_Y, 1);
dim3 blocks
(
(width + threads.x - 1) / threads.x,
(height / 8 + threads.y - 1) / threads.y,
1
);
reconstruct_kernel<<<blocks, threads>>>(cuda_qweight, out, cuda_scales, cuda_qzeros, height / 8, width, groupsize);
}
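reconstruct_kernel unpacks eight 4-bit weights per uint32 of qweight, applying the per-group scale and the stored zero point (which is off by one, hence the "+ 1" above). A NumPy sketch of that dequantization for one packed word (function name hypothetical):

import numpy as np

def dequant_word(q_word, scale, qzero):
    # nibble s/4 of the word holds the weight for row (row_base + s/4)
    w_zero = qzero + 1                              # stored zero point is off by one
    nibbles = (q_word >> np.arange(0, 32, 4)) & 0x0F
    return (nibbles.astype(np.int32) - w_zero) * scale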
// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#ifndef _q4_matrix_cuh
#define _q4_matrix_cuh
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>
class Q4Matrix
{
public:
int device;
int height;
int width;
int groups;
int groupsize;
uint32_t* cuda_qweight = NULL;
uint32_t* cuda_qzeros = NULL;
half* cuda_scales = NULL;
uint32_t* cuda_x_map = NULL;
Q4Matrix
(
const int _height,
const int _width,
const int _groups,
uint32_t* _qweight,
uint32_t* _qzeros,
half* _scales,
uint32_t* _g_idx,
const int _device
);
~Q4Matrix();
void reconstruct(half* out);
private:
void make_sequential(const uint32_t* cpu_g_idx);
};
void g_q4_keep_matrix(Q4Matrix* m);
void g_q4_free_matrices();
#endif