Unverified Commit a7850008 authored by Daniël de Kok's avatar Daniël de Kok Committed by GitHub
Browse files

Add initial support for compressed-tensors checkpoints (#2732)

compressed-tensors is a safetensors extension for sparse, quantized
tensors. The format is more powerful than earlier AWQ/GPTQ/FP8
quantization, because

- Different quantizer configurations can be used for different targets.
- The format can specify input/output quantizers in addition to weight
  quantizers.
- Configurable exclusions for quantization.

This change adds a dependency on the `compressed-tensors` package for
its configuration parsing and layer matching functionality.

The following types of quantization are supported in this PR:

- W8A16 and W4A16 INT using GPTQ-Marlin kernels.
- W8A8 and W8A16 FP using FP8-Marlin and cutlass kernels.

Support for other quantization types will be added in subsequent PRs.
parent 97f7a22f
...@@ -247,7 +247,7 @@ COPY server/Makefile server/Makefile ...@@ -247,7 +247,7 @@ COPY server/Makefile server/Makefile
RUN cd server && \ RUN cd server && \
make gen-server && \ make gen-server && \
pip install -r requirements_cuda.txt && \ pip install -r requirements_cuda.txt && \
pip install ".[bnb, accelerate, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \ pip install ".[bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
pip install nvidia-nccl-cu12==2.22.3 pip install nvidia-nccl-cu12==2.22.3
ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2 ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2
......
...@@ -296,7 +296,7 @@ COPY server/Makefile server/Makefile ...@@ -296,7 +296,7 @@ COPY server/Makefile server/Makefile
RUN cd server && \ RUN cd server && \
make gen-server && \ make gen-server && \
pip install -r requirements_rocm.txt && \ pip install -r requirements_rocm.txt && \
pip install ".[accelerate, peft, outlines]" --no-cache-dir pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
# Install benchmarker # Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
......
...@@ -102,7 +102,7 @@ COPY server/Makefile server/Makefile ...@@ -102,7 +102,7 @@ COPY server/Makefile server/Makefile
RUN cd server && \ RUN cd server && \
make gen-server && \ make gen-server && \
pip install -r requirements_intel.txt && \ pip install -r requirements_intel.txt && \
pip install ".[accelerate, peft, outlines]" --no-cache-dir pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
ENV CCL_ROOT=/opt/intel/oneapi/ccl/latest ENV CCL_ROOT=/opt/intel/oneapi/ccl/latest
ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/latest ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/latest
......
...@@ -63,6 +63,7 @@ Options: ...@@ -63,6 +63,7 @@ Options:
Possible values: Possible values:
- awq: 4 bit quantization. Requires a specific AWQ quantized model: <https://hf.co/models?search=awq>. Should replace GPTQ models wherever possible because of the better latency - awq: 4 bit quantization. Requires a specific AWQ quantized model: <https://hf.co/models?search=awq>. Should replace GPTQ models wherever possible because of the better latency
- compressed-tensors: Compressed tensors, which can be a mixture of different quantization methods
- eetq: 8 bit quantization, doesn't require specific model. Should be a drop-in replacement to bitsandbytes with much better performance. Kernels are from <https://github.com/NetEase-FuXi/EETQ.git> - eetq: 8 bit quantization, doesn't require specific model. Should be a drop-in replacement to bitsandbytes with much better performance. Kernels are from <https://github.com/NetEase-FuXi/EETQ.git>
- exl2: Variable bit quantization. Requires a specific EXL2 quantized model: <https://hf.co/models?search=exl2>. Requires exllama2 kernels and does not support tensor parallelism (num_shard > 1) - exl2: Variable bit quantization. Requires a specific EXL2 quantized model: <https://hf.co/models?search=exl2>. Requires exllama2 kernels and does not support tensor parallelism (num_shard > 1)
- gptq: 4 bit quantization. Requires a specific GTPQ quantized model: <https://hf.co/models?search=gptq>. text-generation-inference will use exllama (faster) kernels wherever possible, and use triton kernel (wider support) when it's not. AWQ has faster kernels - gptq: 4 bit quantization. Requires a specific GTPQ quantized model: <https://hf.co/models?search=gptq>. text-generation-inference will use exllama (faster) kernels wherever possible, and use triton kernel (wider support) when it's not. AWQ has faster kernels
......
...@@ -978,15 +978,16 @@ ...@@ -978,15 +978,16 @@
"nixpkgs": "nixpkgs_6" "nixpkgs": "nixpkgs_6"
}, },
"locked": { "locked": {
"lastModified": 1730724647, "lastModified": 1730795478,
"narHash": "sha256-SVv+50CGaCoU4zZwsg6ZAaOi/D5QJBL1P2SIB+3CEf4=", "narHash": "sha256-xpkXDKnkhXO4F6Ea3reHmqwXXRzQe2PsxdRQFPCViWs=",
"owner": "huggingface", "owner": "huggingface",
"repo": "text-generation-inference-nix", "repo": "text-generation-inference-nix",
"rev": "1512898a1e5ad9eff025205fa9c4d33a44506cf3", "rev": "b7f6c07867d94d6e55f5352573a6b3dad1c88e56",
"type": "github" "type": "github"
}, },
"original": { "original": {
"owner": "huggingface", "owner": "huggingface",
"ref": "compressed-tensors-0.7.1",
"repo": "text-generation-inference-nix", "repo": "text-generation-inference-nix",
"type": "github" "type": "github"
} }
......
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
inputs.nixpkgs.follows = "tgi-nix/nixpkgs"; inputs.nixpkgs.follows = "tgi-nix/nixpkgs";
}; };
nix-filter.url = "github:numtide/nix-filter"; nix-filter.url = "github:numtide/nix-filter";
tgi-nix.url = "github:huggingface/text-generation-inference-nix"; tgi-nix.url = "github:huggingface/text-generation-inference-nix/compressed-tensors-0.7.1";
nixpkgs.follows = "tgi-nix/nixpkgs"; nixpkgs.follows = "tgi-nix/nixpkgs";
flake-utils.url = "github:numtide/flake-utils"; flake-utils.url = "github:numtide/flake-utils";
rust-overlay = { rust-overlay = {
......
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.609375,
"text": "What"
},
{
"id": 374,
"logprob": -0.92529297,
"text": " is"
},
{
"id": 5655,
"logprob": -10.0,
"text": " deep"
},
{
"id": 6975,
"logprob": -0.94628906,
"text": " learning"
},
{
"id": 30,
"logprob": -2.9042969,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 18682,
"logprob": -0.8769531,
"special": false,
"text": " Deep"
},
{
"id": 6975,
"logprob": -0.0076942444,
"special": false,
"text": " learning"
},
{
"id": 374,
"logprob": -0.25073242,
"special": false,
"text": " is"
},
{
"id": 264,
"logprob": -0.097595215,
"special": false,
"text": " a"
},
{
"id": 955,
"logprob": -0.921875,
"special": false,
"text": " type"
},
{
"id": 315,
"logprob": -0.00027918816,
"special": false,
"text": " of"
},
{
"id": 21075,
"logprob": -0.5527344,
"special": false,
"text": " artificial"
},
{
"id": 11478,
"logprob": -0.042541504,
"special": false,
"text": " intelligence"
},
{
"id": 320,
"logprob": -0.38891602,
"special": false,
"text": " ("
},
{
"id": 15836,
"logprob": -0.0011043549,
"special": false,
"text": "AI"
}
],
"top_tokens": null
},
"generated_text": " Deep learning is a type of artificial intelligence (AI"
}
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.609375,
"text": "What"
},
{
"id": 374,
"logprob": -0.92529297,
"text": " is"
},
{
"id": 5655,
"logprob": -10.0,
"text": " deep"
},
{
"id": 6975,
"logprob": -0.94628906,
"text": " learning"
}
],
"seed": 0,
"tokens": [
{
"id": 5380,
"logprob": -0.23840332,
"special": false,
"text": "?\n"
},
{
"id": 34564,
"logprob": 0.0,
"special": false,
"text": "Deep"
},
{
"id": 6975,
"logprob": 0.0,
"special": false,
"text": " learning"
},
{
"id": 11,
"logprob": 0.0,
"special": false,
"text": ","
},
{
"id": 1101,
"logprob": -1.2011719,
"special": false,
"text": " also"
},
{
"id": 3967,
"logprob": 0.0,
"special": false,
"text": " known"
},
{
"id": 439,
"logprob": 0.0,
"special": false,
"text": " as"
},
{
"id": 30828,
"logprob": 0.0,
"special": false,
"text": " neural"
},
{
"id": 4009,
"logprob": -0.6777344,
"special": false,
"text": " network"
},
{
"id": 477,
"logprob": 0.0,
"special": false,
"text": " or"
}
],
"top_tokens": null
},
"generated_text": "What is deep learning?\nDeep learning, also known as neural network or"
}
[
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.609375,
"text": "What"
},
{
"id": 374,
"logprob": -0.92529297,
"text": " is"
},
{
"id": 5655,
"logprob": -10.0,
"text": " deep"
},
{
"id": 6975,
"logprob": -0.94628906,
"text": " learning"
},
{
"id": 30,
"logprob": -2.9042969,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 18682,
"logprob": -0.8769531,
"special": false,
"text": " Deep"
},
{
"id": 6975,
"logprob": -0.0076942444,
"special": false,
"text": " learning"
},
{
"id": 374,
"logprob": -0.25146484,
"special": false,
"text": " is"
},
{
"id": 264,
"logprob": -0.097595215,
"special": false,
"text": " a"
},
{
"id": 955,
"logprob": -0.9248047,
"special": false,
"text": " type"
},
{
"id": 315,
"logprob": -0.00027513504,
"special": false,
"text": " of"
},
{
"id": 21075,
"logprob": -0.5527344,
"special": false,
"text": " artificial"
},
{
"id": 11478,
"logprob": -0.043151855,
"special": false,
"text": " intelligence"
},
{
"id": 320,
"logprob": -0.3840332,
"special": false,
"text": " ("
},
{
"id": 15836,
"logprob": -0.0011043549,
"special": false,
"text": "AI"
}
],
"top_tokens": null
},
"generated_text": " Deep learning is a type of artificial intelligence (AI"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.6054688,
"text": "What"
},
{
"id": 374,
"logprob": -0.92089844,
"text": " is"
},
{
"id": 5655,
"logprob": -10.0,
"text": " deep"
},
{
"id": 6975,
"logprob": -0.94433594,
"text": " learning"
},
{
"id": 30,
"logprob": -2.90625,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 18682,
"logprob": -0.875,
"special": false,
"text": " Deep"
},
{
"id": 6975,
"logprob": -0.007698059,
"special": false,
"text": " learning"
},
{
"id": 374,
"logprob": -0.25268555,
"special": false,
"text": " is"
},
{
"id": 264,
"logprob": -0.09753418,
"special": false,
"text": " a"
},
{
"id": 955,
"logprob": -0.92529297,
"special": false,
"text": " type"
},
{
"id": 315,
"logprob": -0.00027942657,
"special": false,
"text": " of"
},
{
"id": 21075,
"logprob": -0.5527344,
"special": false,
"text": " artificial"
},
{
"id": 11478,
"logprob": -0.042541504,
"special": false,
"text": " intelligence"
},
{
"id": 320,
"logprob": -0.3840332,
"special": false,
"text": " ("
},
{
"id": 15836,
"logprob": -0.0011053085,
"special": false,
"text": "AI"
}
],
"top_tokens": null
},
"generated_text": " Deep learning is a type of artificial intelligence (AI"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.6054688,
"text": "What"
},
{
"id": 374,
"logprob": -0.92089844,
"text": " is"
},
{
"id": 5655,
"logprob": -10.0,
"text": " deep"
},
{
"id": 6975,
"logprob": -0.94433594,
"text": " learning"
},
{
"id": 30,
"logprob": -2.90625,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 18682,
"logprob": -0.875,
"special": false,
"text": " Deep"
},
{
"id": 6975,
"logprob": -0.007698059,
"special": false,
"text": " learning"
},
{
"id": 374,
"logprob": -0.25268555,
"special": false,
"text": " is"
},
{
"id": 264,
"logprob": -0.09753418,
"special": false,
"text": " a"
},
{
"id": 955,
"logprob": -0.92529297,
"special": false,
"text": " type"
},
{
"id": 315,
"logprob": -0.00027942657,
"special": false,
"text": " of"
},
{
"id": 21075,
"logprob": -0.5527344,
"special": false,
"text": " artificial"
},
{
"id": 11478,
"logprob": -0.042541504,
"special": false,
"text": " intelligence"
},
{
"id": 320,
"logprob": -0.3840332,
"special": false,
"text": " ("
},
{
"id": 15836,
"logprob": -0.0011053085,
"special": false,
"text": "AI"
}
],
"top_tokens": null
},
"generated_text": " Deep learning is a type of artificial intelligence (AI"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.6054688,
"text": "What"
},
{
"id": 374,
"logprob": -0.92089844,
"text": " is"
},
{
"id": 5655,
"logprob": -10.0,
"text": " deep"
},
{
"id": 6975,
"logprob": -0.94433594,
"text": " learning"
},
{
"id": 30,
"logprob": -2.90625,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 18682,
"logprob": -0.875,
"special": false,
"text": " Deep"
},
{
"id": 6975,
"logprob": -0.007698059,
"special": false,
"text": " learning"
},
{
"id": 374,
"logprob": -0.25268555,
"special": false,
"text": " is"
},
{
"id": 264,
"logprob": -0.09753418,
"special": false,
"text": " a"
},
{
"id": 955,
"logprob": -0.92529297,
"special": false,
"text": " type"
},
{
"id": 315,
"logprob": -0.00027942657,
"special": false,
"text": " of"
},
{
"id": 21075,
"logprob": -0.5527344,
"special": false,
"text": " artificial"
},
{
"id": 11478,
"logprob": -0.042541504,
"special": false,
"text": " intelligence"
},
{
"id": 320,
"logprob": -0.3840332,
"special": false,
"text": " ("
},
{
"id": 15836,
"logprob": -0.0011053085,
"special": false,
"text": "AI"
}
],
"top_tokens": null
},
"generated_text": " Deep learning is a type of artificial intelligence (AI"
}
]
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 1841,
"logprob": -5.46875,
"text": "What"
},
{
"id": 603,
"logprob": -0.69140625,
"text": " is"
},
{
"id": 5271,
"logprob": -12.0,
"text": " deep"
},
{
"id": 6044,
"logprob": -0.32226562,
"text": " learning"
},
{
"id": 235336,
"logprob": -0.33203125,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 109,
"logprob": -0.24707031,
"special": false,
"text": "\n\n"
},
{
"id": 26843,
"logprob": -0.14550781,
"special": false,
"text": "Deep"
},
{
"id": 6044,
"logprob": -0.038330078,
"special": false,
"text": " learning"
},
{
"id": 603,
"logprob": -0.029907227,
"special": false,
"text": " is"
},
{
"id": 476,
"logprob": -0.020996094,
"special": false,
"text": " a"
},
{
"id": 38397,
"logprob": -0.828125,
"special": false,
"text": " subset"
},
{
"id": 576,
"logprob": -0.00049209595,
"special": false,
"text": " of"
},
{
"id": 6479,
"logprob": -0.057373047,
"special": false,
"text": " machine"
},
{
"id": 6044,
"logprob": -0.000207901,
"special": false,
"text": " learning"
},
{
"id": 674,
"logprob": -0.15429688,
"special": false,
"text": " that"
}
],
"top_tokens": null
},
"generated_text": "\n\nDeep learning is a subset of machine learning that"
}
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 1841,
"logprob": -5.46875,
"text": "What"
},
{
"id": 603,
"logprob": -0.69140625,
"text": " is"
},
{
"id": 5271,
"logprob": -12.0,
"text": " deep"
},
{
"id": 6044,
"logprob": -0.32226562,
"text": " learning"
}
],
"seed": 0,
"tokens": [
{
"id": 235336,
"logprob": 0.0,
"special": false,
"text": "?"
},
{
"id": 109,
"logprob": 0.0,
"special": false,
"text": "\n\n"
},
{
"id": 26843,
"logprob": 0.0,
"special": false,
"text": "Deep"
},
{
"id": 14715,
"logprob": -0.38671875,
"special": false,
"text": " Learning"
},
{
"id": 603,
"logprob": 0.0,
"special": false,
"text": " is"
},
{
"id": 476,
"logprob": 0.0,
"special": false,
"text": " a"
},
{
"id": 38397,
"logprob": -0.12695312,
"special": false,
"text": " subset"
},
{
"id": 576,
"logprob": 0.0,
"special": false,
"text": " of"
},
{
"id": 6479,
"logprob": 0.0,
"special": false,
"text": " machine"
},
{
"id": 6044,
"logprob": 0.0,
"special": false,
"text": " learning"
}
],
"top_tokens": null
},
"generated_text": "What is deep learning?\n\nDeep Learning is a subset of machine learning"
}
[
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 1841,
"logprob": -5.46875,
"text": "What"
},
{
"id": 603,
"logprob": -0.69140625,
"text": " is"
},
{
"id": 5271,
"logprob": -12.0,
"text": " deep"
},
{
"id": 6044,
"logprob": -0.32226562,
"text": " learning"
},
{
"id": 235336,
"logprob": -0.33203125,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 109,
"logprob": -0.24707031,
"special": false,
"text": "\n\n"
},
{
"id": 26843,
"logprob": -0.14550781,
"special": false,
"text": "Deep"
},
{
"id": 6044,
"logprob": -0.03857422,
"special": false,
"text": " learning"
},
{
"id": 603,
"logprob": -0.030883789,
"special": false,
"text": " is"
},
{
"id": 476,
"logprob": -0.020996094,
"special": false,
"text": " a"
},
{
"id": 38397,
"logprob": -0.828125,
"special": false,
"text": " subset"
},
{
"id": 576,
"logprob": -0.00051498413,
"special": false,
"text": " of"
},
{
"id": 6479,
"logprob": -0.05883789,
"special": false,
"text": " machine"
},
{
"id": 6044,
"logprob": -0.00020694733,
"special": false,
"text": " learning"
},
{
"id": 674,
"logprob": -0.15820312,
"special": false,
"text": " that"
}
],
"top_tokens": null
},
"generated_text": "\n\nDeep learning is a subset of machine learning that"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 1841,
"logprob": -5.46875,
"text": "What"
},
{
"id": 603,
"logprob": -0.71484375,
"text": " is"
},
{
"id": 5271,
"logprob": -12.0,
"text": " deep"
},
{
"id": 6044,
"logprob": -0.30859375,
"text": " learning"
},
{
"id": 235336,
"logprob": -0.3359375,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 109,
"logprob": -0.23828125,
"special": false,
"text": "\n\n"
},
{
"id": 26843,
"logprob": -0.14550781,
"special": false,
"text": "Deep"
},
{
"id": 6044,
"logprob": -0.038330078,
"special": false,
"text": " learning"
},
{
"id": 603,
"logprob": -0.030883789,
"special": false,
"text": " is"
},
{
"id": 476,
"logprob": -0.020996094,
"special": false,
"text": " a"
},
{
"id": 38397,
"logprob": -0.80859375,
"special": false,
"text": " subset"
},
{
"id": 576,
"logprob": -0.0005455017,
"special": false,
"text": " of"
},
{
"id": 6479,
"logprob": -0.05908203,
"special": false,
"text": " machine"
},
{
"id": 6044,
"logprob": -0.00020599365,
"special": false,
"text": " learning"
},
{
"id": 674,
"logprob": -0.17285156,
"special": false,
"text": " that"
}
],
"top_tokens": null
},
"generated_text": "\n\nDeep learning is a subset of machine learning that"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 1841,
"logprob": -5.46875,
"text": "What"
},
{
"id": 603,
"logprob": -0.71484375,
"text": " is"
},
{
"id": 5271,
"logprob": -12.0,
"text": " deep"
},
{
"id": 6044,
"logprob": -0.30859375,
"text": " learning"
},
{
"id": 235336,
"logprob": -0.3359375,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 109,
"logprob": -0.23828125,
"special": false,
"text": "\n\n"
},
{
"id": 26843,
"logprob": -0.14550781,
"special": false,
"text": "Deep"
},
{
"id": 6044,
"logprob": -0.038330078,
"special": false,
"text": " learning"
},
{
"id": 603,
"logprob": -0.030883789,
"special": false,
"text": " is"
},
{
"id": 476,
"logprob": -0.020996094,
"special": false,
"text": " a"
},
{
"id": 38397,
"logprob": -0.80859375,
"special": false,
"text": " subset"
},
{
"id": 576,
"logprob": -0.0005455017,
"special": false,
"text": " of"
},
{
"id": 6479,
"logprob": -0.05908203,
"special": false,
"text": " machine"
},
{
"id": 6044,
"logprob": -0.00020599365,
"special": false,
"text": " learning"
},
{
"id": 674,
"logprob": -0.17285156,
"special": false,
"text": " that"
}
],
"top_tokens": null
},
"generated_text": "\n\nDeep learning is a subset of machine learning that"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 1841,
"logprob": -5.46875,
"text": "What"
},
{
"id": 603,
"logprob": -0.71484375,
"text": " is"
},
{
"id": 5271,
"logprob": -12.0,
"text": " deep"
},
{
"id": 6044,
"logprob": -0.30859375,
"text": " learning"
},
{
"id": 235336,
"logprob": -0.3359375,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 109,
"logprob": -0.23828125,
"special": false,
"text": "\n\n"
},
{
"id": 26843,
"logprob": -0.14550781,
"special": false,
"text": "Deep"
},
{
"id": 6044,
"logprob": -0.038330078,
"special": false,
"text": " learning"
},
{
"id": 603,
"logprob": -0.030883789,
"special": false,
"text": " is"
},
{
"id": 476,
"logprob": -0.020996094,
"special": false,
"text": " a"
},
{
"id": 38397,
"logprob": -0.80859375,
"special": false,
"text": " subset"
},
{
"id": 576,
"logprob": -0.0005455017,
"special": false,
"text": " of"
},
{
"id": 6479,
"logprob": -0.05908203,
"special": false,
"text": " machine"
},
{
"id": 6044,
"logprob": -0.00020599365,
"special": false,
"text": " learning"
},
{
"id": 674,
"logprob": -0.17285156,
"special": false,
"text": " that"
}
],
"top_tokens": null
},
"generated_text": "\n\nDeep learning is a subset of machine learning that"
}
]
import pytest
@pytest.fixture(scope="module")
def compressed_tensors_w8an_handle(launcher):
with launcher(
"neuralmagic/Llama-3.2-1B-Instruct-FP8",
num_shard=2,
quantize="compressed-tensors",
) as handle:
yield handle
@pytest.fixture(scope="module")
async def compressed_tensors_w8an(compressed_tensors_w8an_handle):
await compressed_tensors_w8an_handle.health(300)
return compressed_tensors_w8an_handle.client
@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_w8an(compressed_tensors_w8an, response_snapshot):
response = await compressed_tensors_w8an.generate(
"What is deep learning?",
max_new_tokens=10,
decoder_input_details=True,
)
assert (
response.generated_text
== " Deep learning is a type of artificial intelligence (AI"
)
assert response.details.generated_tokens == 10
assert response == response_snapshot
@pytest.mark.asyncio
async def test_compressed_tensors_w8an_all_params(
compressed_tensors_w8an, response_snapshot
):
response = await compressed_tensors_w8an.generate(
"What is deep learning",
max_new_tokens=10,
repetition_penalty=1.2,
return_full_text=True,
stop_sequences=["test"],
temperature=0.5,
top_p=0.9,
top_k=10,
truncate=5,
typical_p=0.9,
watermark=True,
decoder_input_details=True,
seed=0,
)
assert response.details.generated_tokens == 10
assert (
response.generated_text
== "What is deep learning?\nDeep learning, also known as neural network or"
)
assert response == response_snapshot
@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_w8an_load(
compressed_tensors_w8an, generate_load, response_snapshot
):
responses = await generate_load(
compressed_tensors_w8an,
"What is deep learning?",
max_new_tokens=10,
n=4,
)
assert (
responses[0].generated_text
== " Deep learning is a type of artificial intelligence (AI"
)
assert len(responses) == 4
assert all([r.generated_text == responses[0].generated_text for r in responses])
assert responses == response_snapshot
import pytest
@pytest.fixture(scope="module")
def compressed_tensors_wna16_handle(launcher):
with launcher(
"neuralmagic/gemma-2-2b-it-quantized.w4a16",
num_shard=2,
quantize="compressed-tensors",
) as handle:
yield handle
@pytest.fixture(scope="module")
async def compressed_tensors_wna16(compressed_tensors_wna16_handle):
await compressed_tensors_wna16_handle.health(300)
return compressed_tensors_wna16_handle.client
@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_wna16(compressed_tensors_wna16, response_snapshot):
response = await compressed_tensors_wna16.generate(
"What is deep learning?",
max_new_tokens=10,
decoder_input_details=True,
)
assert (
response.generated_text
== "\n\nDeep learning is a subset of machine learning that"
)
assert response.details.generated_tokens == 10
assert response == response_snapshot
@pytest.mark.asyncio
async def test_compressed_tensors_wna16_all_params(
compressed_tensors_wna16, response_snapshot
):
response = await compressed_tensors_wna16.generate(
"What is deep learning",
max_new_tokens=10,
repetition_penalty=1.2,
return_full_text=True,
stop_sequences=["test"],
temperature=0.5,
top_p=0.9,
top_k=10,
truncate=5,
typical_p=0.9,
watermark=True,
decoder_input_details=True,
seed=0,
)
assert response.details.generated_tokens == 10
assert (
response.generated_text
== "What is deep learning?\n\nDeep Learning is a subset of machine learning"
)
assert response == response_snapshot
@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_wna16_load(
compressed_tensors_wna16, generate_load, response_snapshot
):
responses = await generate_load(
compressed_tensors_wna16,
"What is deep learning?",
max_new_tokens=10,
n=4,
)
assert (
responses[0].generated_text
== "\n\nDeep learning is a subset of machine learning that"
)
assert len(responses) == 4
assert all([r.generated_text == responses[0].generated_text for r in responses])
assert responses == response_snapshot
...@@ -212,6 +212,8 @@ enum Quantization { ...@@ -212,6 +212,8 @@ enum Quantization {
/// <https://hf.co/models?search=awq>. /// <https://hf.co/models?search=awq>.
/// Should replace GPTQ models wherever possible because of the better latency /// Should replace GPTQ models wherever possible because of the better latency
Awq, Awq,
/// Compressed tensors, which can be a mixture of different quantization methods.
CompressedTensors,
/// 8 bit quantization, doesn't require specific model. /// 8 bit quantization, doesn't require specific model.
/// Should be a drop-in replacement to bitsandbytes with much better performance. /// Should be a drop-in replacement to bitsandbytes with much better performance.
/// Kernels are from <https://github.com/NetEase-FuXi/EETQ.git> /// Kernels are from <https://github.com/NetEase-FuXi/EETQ.git>
...@@ -274,6 +276,9 @@ impl std::fmt::Display for Quantization { ...@@ -274,6 +276,9 @@ impl std::fmt::Display for Quantization {
Quantization::Awq => { Quantization::Awq => {
write!(f, "awq") write!(f, "awq")
} }
Quantization::CompressedTensors => {
write!(f, "compressed-tensors")
}
Quantization::Eetq => { Quantization::Eetq => {
write!(f, "eetq") write!(f, "eetq")
} }
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
mypy-protobuf, mypy-protobuf,
awq-inference-engine, awq-inference-engine,
causal-conv1d, causal-conv1d,
compressed-tensors,
eetq, eetq,
einops, einops,
exllamav2, exllamav2,
...@@ -74,6 +75,7 @@ buildPythonPackage { ...@@ -74,6 +75,7 @@ buildPythonPackage {
awq-inference-engine awq-inference-engine
eetq eetq
causal-conv1d causal-conv1d
compressed-tensors
einops einops
exllamav2 exllamav2
flashinfer flashinfer
......
...@@ -23,7 +23,7 @@ gen-server: ...@@ -23,7 +23,7 @@ gen-server:
install-server: gen-server install-server: gen-server
pip install pip --upgrade pip install pip --upgrade
pip install -r requirements_cuda.txt pip install -r requirements_cuda.txt
pip install -e ".[accelerate, quantize, peft, outlines]" pip install -e ".[accelerate, compressed-tensors, quantize, peft, outlines]"
install: install-cuda install: install-cuda
......
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. # This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
[[package]] [[package]]
name = "accelerate" name = "accelerate"
...@@ -388,6 +388,26 @@ files = [ ...@@ -388,6 +388,26 @@ files = [
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
] ]
[[package]]
name = "compressed-tensors"
version = "0.7.1"
description = "Library for utilization of compressed safetensors of neural network models"
optional = true
python-versions = "*"
files = [
{file = "compressed-tensors-0.7.1.tar.gz", hash = "sha256:3c7865ebfe4ea76ae94d7c674bcf93aedd2064571f682c09a377a219d5ebb3a0"},
{file = "compressed_tensors-0.7.1-py3-none-any.whl", hash = "sha256:22d11558a70f655ae647db9c8e9fb14a5e9d6983ca5aec3f267518625fd6dd0e"},
]
[package.dependencies]
pydantic = ">=2.0"
torch = ">=1.7.0"
transformers = "*"
[package.extras]
accelerate = ["accelerate"]
dev = ["black (==22.12.0)", "flake8 (>=3.8.3)", "isort (==5.8.0)", "nbconvert (>=7.16.3)", "pytest (>=6.0.0)", "wheel (>=0.36.2)"]
[[package]] [[package]]
name = "datasets" name = "datasets"
version = "2.21.0" version = "2.21.0"
...@@ -3982,4 +4002,4 @@ torch = ["torch"] ...@@ -3982,4 +4002,4 @@ torch = ["torch"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = ">=3.9,<3.13" python-versions = ">=3.9,<3.13"
content-hash = "b39033e573f50a0f046787aebf1702d86673aad0b2fcee818404fcea7f644b81" content-hash = "4636689efd4c94559c3c23903aafcffd177533a3b9006b3b4f8491b158a3a754"
...@@ -37,6 +37,7 @@ pillow = "^10.0.0" ...@@ -37,6 +37,7 @@ pillow = "^10.0.0"
outlines= { version = "^0.0.34", optional = true } outlines= { version = "^0.0.34", optional = true }
prometheus-client = "^0.20.0" prometheus-client = "^0.20.0"
py-cpuinfo = "^9.0.0" py-cpuinfo = "^9.0.0"
compressed-tensors = { version = "^0.7.1", optional = true }
# Remove later, temporary workaround for outlines. # Remove later, temporary workaround for outlines.
numpy = "^1.26" numpy = "^1.26"
...@@ -58,6 +59,7 @@ rich = "^13.7.1" ...@@ -58,6 +59,7 @@ rich = "^13.7.1"
torch = ["torch"] torch = ["torch"]
accelerate = ["accelerate"] accelerate = ["accelerate"]
bnb = ["bitsandbytes"] bnb = ["bitsandbytes"]
compressed-tensors = ["compressed-tensors"]
marlin = ["marlin-kernels"] marlin = ["marlin-kernels"]
moe = ["moe-kernels"] moe = ["moe-kernels"]
peft = ["peft"] peft = ["peft"]
......
...@@ -19,6 +19,7 @@ class Quantization(str, Enum): ...@@ -19,6 +19,7 @@ class Quantization(str, Enum):
bitsandbytes_fp4 = "bitsandbytes-fp4" bitsandbytes_fp4 = "bitsandbytes-fp4"
gptq = "gptq" gptq = "gptq"
awq = "awq" awq = "awq"
compressed_tensors = "compressed-tensors"
eetq = "eetq" eetq = "eetq"
exl2 = "exl2" exl2 = "exl2"
fp8 = "fp8" fp8 = "fp8"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment