[Kernel/Quant] Remove AQLM (#22943)

Signed-off-by: mgoin <mgoin64@gmail.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>

[Kernel/Quant] Remove AQLM (#22943)
Signed-off-by: mgoin <mgoin64@gmail.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
4fc722ec · Michael Goin · GitHub · 3253ae76 · 4fc722ec · 4fc722ec
Unverified Commit 4fc722ec authored Aug 16, 2025 by Michael Goin Committed by GitHub Aug 16, 2025
16 changed files
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -121,7 +121,6 @@ fi
 if [[ $commands == *" kernels/quantization"* ]]; then
  commands="${commands} \
  --ignore=kernels/quantization/test_int8_quant.py \
-  --ignore=kernels/quantization/test_aqlm.py \
  --ignore=kernels/quantization/test_machete_mm.py \
  --ignore=kernels/quantization/test_block_fp8.py \
  --ignore=kernels/quantization/test_block_int8.py \

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -286,7 +286,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  FetchContent_MakeAvailable(cutlass)
  list(APPEND VLLM_EXT_SRC
-    "csrc/quantization/aqlm/gemm_kernels.cu"
    "csrc/quantization/awq/gemm_kernels.cu"
    "csrc/permute_cols.cu"
    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"

--- a/benchmarks/kernels/benchmark_aqlm.py
+++ b/benchmarks/kernels/benchmark_aqlm.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os
-import sys
-from typing import Optional
-import torch
-import torch.nn.functional as F
-from vllm import _custom_ops as ops
-from vllm.model_executor.layers.quantization.aqlm import (
-    dequantize_weight,
-    generic_dequantize_gemm,
-    get_int_dtype,
-    optimized_dequantize_gemm,
-)
-from vllm.utils import FlexibleArgumentParser
-os.environ["CUDA_VISIBLE_DEVICES"] = "0"
-def torch_mult(
-    # [..., in_features]
-    input: torch.Tensor,
-    weights: torch.Tensor,
-    # [num_out_groups, 1, 1, 1]
-    scales: torch.Tensor,
-) -> torch.Tensor:
-    output = F.linear(input, weights)
-    return output
-def dequant_out_scale(
-    # [..., in_features]
-    input: torch.Tensor,
-    # [num_out_groups, num_in_groups, num_codebooks]
-    codes: torch.IntTensor,
-    # [num_codebooks, codebook_size, out_group_size, in_group_size]
-    codebooks: torch.Tensor,
-    # [num_out_groups, 1, 1, 1]
-    scales: torch.Tensor,
-    output_partition_sizes: torch.IntTensor,
-    bias: Optional[torch.Tensor],
-) -> torch.Tensor:
-    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
-    if bias is None:
-        output = F.linear(input, weights, bias)
-        orig_shape = output.shape
-        flattened_output = output.view(-1, output.size(-1))
-        f_scales = scales.view(-1, scales.shape[0])
-        b_scales = f_scales.expand(flattened_output.shape[0], -1)
-        flattened_output *= b_scales
-        return flattened_output.view(orig_shape)
-    else:
-        b_scales = scales.view(scales.shape[:-3] + (-1,)).expand(-1, weights.shape[1])
-        weights *= b_scales
-        return F.linear(input, weights, bias)
-def dequant_weight_scale(
-    # [..., in_features]
-    input: torch.Tensor,
-    # [num_out_groups, num_in_groups, num_codebooks]
-    codes: torch.IntTensor,
-    # [num_codebooks, codebook_size, out_group_size, in_group_size]
-    codebooks: torch.Tensor,
-    # [num_out_groups, 1, 1, 1]
-    scales: torch.Tensor,
-    output_partition_sizes: torch.IntTensor,
-    bias: Optional[torch.Tensor],
-) -> torch.Tensor:
-    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
-    b_scales = scales.view(scales.shape[:-3] + (-1,)).expand(-1, weights.shape[1])
-    weights *= b_scales
-    return F.linear(input, weights, bias)
-def dequant_no_scale(
-    # [..., in_features]
-    input: torch.Tensor,
-    # [num_out_groups, num_in_groups, num_codebooks]
-    codes: torch.IntTensor,
-    # [num_codebooks, codebook_size, out_group_size, in_group_size]
-    codebooks: torch.Tensor,
-    # [num_out_groups, 1, 1, 1]
-    scales: torch.Tensor,
-    output_partition_sizes: torch.IntTensor,
-    bias: Optional[torch.Tensor],
-) -> torch.Tensor:
-    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
-    return F.linear(input, weights, bias)
-# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
-# the generic pytorch version.
-# Just visual comparison.
-def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:
-    n = int(parts.sum().item())
-    device = torch.device("cuda:0")
-    code_range = (1 << bits) // 2
-    ingroups = 8
-    codes = torch.randint(
-        -code_range,
-        code_range,
-        size=(n, k // ingroups, nbooks),
-        dtype=get_int_dtype(bits),
-        device=device,
-    )
-    codebooks = torch.randn(
-        size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
-        dtype=torch.float16,
-        device=device,
-    )
-    count = 0
-    for index in range(16):
-        for i in range(8):
-            for book in range(nbooks):
-                codebooks[book, index, 0, i] = count * (10**book)
-            count += 1
-    print("codes shape", codes.shape)
-    for i in range(16):
-        for book in range(nbooks):
-            codes[0, i, book] = i
-            codes[0, -i, book] = i
-    weights = dequantize_weight(codes, codebooks, None)
-    weights2 = ops.aqlm_dequant(codes, codebooks, parts)
-    print("weights shape:", weights.shape)
-    print("weights2 shape:", weights2.shape)
-    print("weights are:", weights)
-    print("weights2 are:", weights2)
-    print("first 128 weights are", weights[0, 0:128].to(torch.int32))
-    print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32))
-    print("last 128 weights are", weights[0, -128:])
-    print("last 128 weights2 are:", weights2[0, -128:])
-def main():
-    parser = FlexibleArgumentParser(description="Benchmark aqlm performance.")
-    # Add arguments
-    parser.add_argument(
-        "--nbooks", type=int, default=1, help="Number of codebooks (default: 1)"
-    )
-    parser.add_argument(
-        "--bits",
-        type=int,
-        default=16,
-        help="Number of bits per code element (default: 16)",
-    )
-    parser.add_argument(
-        "--test",
-        type=bool,
-        default=False,
-        help="Run the decompression/dequant tester rather than benchmarking "
-        "(default: False)",
-    )
-    # Parse the arguments
-    args = parser.parse_args()
-    # Extract values
-    nbooks = args.nbooks
-    bits = args.bits
-    if args.test:
-        dequant_test(4096, torch.tensor((4096,)), nbooks, bits)
-        return
-    # Otherwise, benchmark.
-    methods = [
-        ops.aqlm_gemm,
-        dequant_out_scale,
-        generic_dequantize_gemm,
-        optimized_dequantize_gemm,
-        dequant_weight_scale,
-        torch_mult,
-        dequant_no_scale,
-    ]
-    filename = f"./aqlm_benchmark_{nbooks}x{bits}.csv"
-    print(f"writing benchmarks to file {filename}")
-    with open(filename, "w") as f:
-        sys.stdout = f
-        print("m | k | n | n parts", end="")
-        for method in methods:
-            print(f" | {method.__name__.replace('_', ' ')} (µs)", end="")
-        print("")
-        # These are reasonable prefill sizes.
-        ksandpartions = (
-            (4096, (4096, 4096, 4096)),
-            (4096, (4096,)),
-            (4096, (11008, 11008)),
-            (11008, (4096,)),
-        )
-        # reasonable ranges for m.
-        for m in [
-            1,
-            2,
-            4,
-            8,
-            10,
-            12,
-            14,
-            16,
-            24,
-            32,
-            48,
-            52,
-            56,
-            64,
-            96,
-            112,
-            128,
-            256,
-            512,
-            1024,
-            1536,
-            2048,
-            3072,
-            4096,
-        ]:
-            print(f"{m}", file=sys.__stdout__)
-            for ksp in ksandpartions:
-                run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits, methods)
-        sys.stdout = sys.__stdout__
-def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, methods):
-    # I didn't see visible improvements from increasing these, but feel free :)
-    num_warmup_trials = 1
-    num_trials = 1
-    num_calls = 100
-    # warmup.
-    for method in methods:
-        for _ in range(num_warmup_trials):
-            run_timing(
-                num_calls=num_calls,
-                m=m,
-                k=k,
-                parts=parts,
-                nbooks=nbooks,
-                bits=bits,
-                method=method,
-            )
-    n = parts.sum().item()
-    print(f"{m} | {k} | {n} | {parts.tolist()}", end="")
-    for method in methods:
-        best_time_us = 1e20
-        for _ in range(num_trials):
-            kernel_dur_ms = run_timing(
-                num_calls=num_calls,
-                m=m,
-                k=k,
-                parts=parts,
-                nbooks=nbooks,
-                bits=bits,
-                method=method,
-            )
-            kernel_dur_us = 1000 * kernel_dur_ms
-            if kernel_dur_us < best_time_us:
-                best_time_us = kernel_dur_us
-        print(f" | {kernel_dur_us:.0f}", end="")
-    print("")
-def run_timing(
-    num_calls: int, m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, method
-) -> float:
-    n = int(parts.sum().item())
-    device = torch.device("cuda:0")
-    input = torch.randn((1, m, k), dtype=torch.float16, device=device)
-    code_range = (1 << bits) // 2
-    ingroups = 8
-    codes = torch.randint(
-        -code_range,
-        code_range,
-        size=(n, k // ingroups, nbooks),
-        dtype=get_int_dtype(bits),
-        device=device,
-    )
-    codebooks = torch.randn(
-        size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
-        dtype=torch.float16,
-        device=device,
-    )
-    scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device)
-    # for comparison to just a pytorch mult.
-    weights = torch.randn((n, k), dtype=torch.float16, device=device)
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
-    start_event.record()
-    if method is torch_mult:
-        for i in range(num_calls):
-            torch_mult(input, weights, scales)
-    else:
-        for i in range(num_calls):
-            method(input, codes, codebooks, scales, parts, None)
-    end_event.record()
-    end_event.synchronize()
-    dur_ms = start_event.elapsed_time(end_event) / num_calls
-    return dur_ms
-if __name__ == "__main__":
-    sys.exit(main())
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -154,15 +154,6 @@ void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope,
 torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor);
 #ifndef USE_ROCM
-torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
-                        const torch::Tensor& codebooks,
-                        const torch::Tensor& scales,
-                        const std::vector<int64_t>& codebook_partition_sizes,
-                        const std::optional<torch::Tensor>& bias);
-torch::Tensor aqlm_dequant(
-    const torch::Tensor& codes, const torch::Tensor& codebooks,
-    const std::vector<int64_t>& codebook_partition_sizes);
 torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel,
                       torch::Tensor _scaling_factors, torch::Tensor _zeros,

--- a/csrc/quantization/aqlm/gemm_kernels.cu
+++ b/csrc/quantization/aqlm/gemm_kernels.cu
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -207,21 +207,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // Quantization ops
 #ifndef USE_ROCM
-  // Quantized GEMM for AQLM.
-  ops.def(
-      "aqlm_gemm(Tensor input, Tensor codes, Tensor codebooks, "
-      "Tensor scales, int[] codebook_partition_sizes, Tensor? bias) "
-      "-> Tensor",
-      {stride_tag});
-  ops.impl("aqlm_gemm", torch::kCUDA, &aqlm_gemm);
-  // Decompression method for AQLM.
-  ops.def(
-      "aqlm_dequant(Tensor codes, Tensor codebooks, "
-      "int[] codebook_partition_sizes) -> Tensor",
-      {stride_tag});
-  ops.impl("aqlm_dequant", torch::kCUDA, &aqlm_dequant);
  // Quantized GEMM for AWQ.
  ops.def(
      "awq_gemm(Tensor _in_feats, Tensor _kernel, Tensor _scaling_factors, "

--- a/docs/features/quantization/supported_hardware.md
+++ b/docs/features/quantization/supported_hardware.md
@@ -17,7 +17,6 @@ th {
 | INT8 (W8A8)           | ❌      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ✅︎        | ✅︎          | ✅︎           |
 | FP8 (W8A8)            | ❌      | ❌       | ❌       | ✅︎    | ✅︎       | ✅︎         | ❌          | ❌         | ❌        | ✅︎          | ❌           |
 | BitBLAS (GPTQ)        | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌          | ❌           |
-| AQLM                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌          | ❌           |
 | bitsandbytes          | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌          | ❌           |
 | DeepSpeedFP           | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌          | ❌           |
 | GGUF                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ✅︎         | ❌          | ❌         | ❌        | ❌          | ❌           |

--- a/docs/mkdocs/hooks/generate_examples.py
+++ b/docs/mkdocs/hooks/generate_examples.py
@@ -24,7 +24,6 @@ def fix_case(text: str) -> str:
        "llm": "LLM",
        "mae": "MAE",
        "tpu": "TPU",
-        "aqlm": "AQLM",
        "gguf": "GGUF",
        "lora": "LoRA",
        "rlhf": "RLHF",

--- a/examples/offline_inference/basic/README.md
+++ b/examples/offline_inference/basic/README.md
@@ -52,20 +52,6 @@ Try it yourself with the following argument:
 ### Quantization
-#### AQLM
-vLLM supports models that are quantized using AQLM.
-Try one yourself by passing one of the following models to the `--model` argument:
- `ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf`
- `ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf`
- `ISTA-DASLab/Llama-2-13b-AQLM-2Bit-1x16-hf`
- `ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf`
- `BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf`
-> Some of these models are likely to be too large for a single GPU. You can split them across multiple GPUs by setting `--tensor-parallel-size` to the number of required GPUs.
 #### GGUF
 vLLM supports models that are quantized using GGUF.

--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@@ -31,10 +31,6 @@ def models_list(*, all: bool = True, keywords: Optional[list[str]] = None):
    ]
    if all:
-        if is_quant_method_supported("aqlm"):
-            TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
-                "quantization": "aqlm"
-            }))
        # TODO: figure out why this fails.
        if False and is_quant_method_supported("gguf"):  # noqa: SIM223

--- a/tests/kernels/quantization/test_aqlm.py
+++ b/tests/kernels/quantization/test_aqlm.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import torch
-from tests.kernels.utils import opcheck
-from vllm import _custom_ops as ops  # noqa: F401
-def test_aqlm_dequant_opcheck():
-    codes = torch.randint(-32768,
-                          32767, (22016, 512, 1),
-                          device='cuda',
-                          dtype=torch.int16)
-    codebooks = torch.rand((2, 65536, 1, 8),
-                           device='cuda',
-                           dtype=torch.float16)
-    codebook_partition_sizes = [11008, 11008]
-    opcheck(torch.ops._C.aqlm_dequant,
-            (codes, codebooks, codebook_partition_sizes))
-def test_aqlm_gemm_opcheck():
-    input = torch.rand((4, 4096), device='cuda', dtype=torch.float16)
-    codes = torch.randint(-32768,
-                          32767, (12288, 512, 1),
-                          device='cuda',
-                          dtype=torch.int16)
-    codebooks = torch.rand((3, 65536, 1, 8),
-                           device='cuda',
-                           dtype=torch.float16)
-    scales = torch.rand((12288, 1, 1, 1), device='cuda', dtype=torch.float16)
-    codebook_partition_sizes = [4096, 4096, 4096]
-    bias = None
-    opcheck(torch.ops._C.aqlm_gemm,
-            (input, codes, codebooks, scales, codebook_partition_sizes, None))
-    opcheck(torch.ops._C.aqlm_gemm,
-            (input, codes, codebooks, scales, codebook_partition_sizes, bias))
--- a/tests/models/quantization/test_aqlm.py
+++ b/tests/models/quantization/test_aqlm.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import pytest
-from tests.quantization.utils import is_quant_method_supported
-from vllm.platforms import current_platform
-# These ground truth generations were generated using `transformers==4.38.1
-# aqlm==1.1.0 torch==2.2.0`
-# and the below code:
-# ```python
-# from transformers import AutoTokenizer, AutoModelForCausalLM
-# model_id = "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"
-# quantized_model = AutoModelForCausalLM.from_pretrained(model_id,
-# torch_dtype="auto", device_map="cuda").cuda()
-# tokenizer = AutoTokenizer.from_pretrained(model_id)
-# outputs = []
-# for prompt in example_prompts:
-#     input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda")
-#     hf_outputs = quantized_model.generate(input_ids, max_new_tokens=32)
-#     outputs.append(tokenizer.decode(hf_outputs[0][input_ids.shape[1]:]))
-# print(outputs)
-# ```
-ground_truth_generations = [
-    '\n### Features\n\n- **High-throughput**: v',
-    'The major milestones in the development of artificial intelligence from '
-    '195',
-    'Compare and contrast artificial intelligence with human intelligence in '
-    'terms of processing information. The',
-    'Explain the difference between supervised and unsupervised learning.'
-    '\nExplain',
-    'Write a short story about a robot that dreams for the first time. The',
-    'Analyze the impact of the COVID-19 pandemic on global economic',
-    'The Mona Lisa is a painting by Leonardo da Vinci, and it',
-    'The early bird catches the worm.\nThe early bird catches the'
-]
-@pytest.mark.skipif(not is_quant_method_supported("aqlm")
-                    or current_platform.is_rocm()
-                    or not current_platform.is_cuda(),
-                    reason="AQLM is not supported on this GPU type.")
-@pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"])
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [16])
-@pytest.mark.parametrize("num_logprobs", [1])
-def test_models(
-    vllm_runner,
-    example_prompts,
-    model: str,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-) -> None:
-    with vllm_runner(model, dtype=dtype) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
-    # loop through the prompts to compare against the ground truth generations
-    for prompt_idx in range(len(example_prompts)):
-        vllm_output_ids, vllm_output_str, vllm_logprobs = vllm_outputs[
-            prompt_idx]
-        print("Prompt:          ", repr(example_prompts[prompt_idx]))
-        print("Reference output:", repr(ground_truth_generations[prompt_idx]))
-        print("Output output:   ", repr(vllm_output_str))
-        assert vllm_output_str == ground_truth_generations[prompt_idx]
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -476,32 +476,6 @@ if hasattr(torch.ops._C, "gptq_marlin_24_gemm"):
                           dtype=input.dtype,
                           device=input.device).sum(0)
-    @register_fake("_C::aqlm_gemm")
-    def _aqlm_gemm_fake(input: torch.Tensor, codes: torch.Tensor,
-                        codebooks: torch.Tensor, scales: torch.Tensor,
-                        codebook_partition_sizes: list[int],
-                        bias: Optional[torch.Tensor]) -> torch.Tensor:
-        out_features = codes.size(0) * codebooks.size(2)
-        flat_input = input.reshape((-1, input.size(-1)))
-        flat_output = torch.empty((flat_input.size(0), out_features),
-                                  dtype=input.dtype,
-                                  device=input.device)
-        output_sizes = list(input.shape)
-        output_sizes.pop()
-        output_sizes.append(-1)
-        return flat_output.reshape(tuple(output_sizes))
-    @register_fake("_C::aqlm_dequant")
-    def _aqlm_dequant_fake(
-            codes: torch.Tensor, codebooks: torch.Tensor,
-            codebook_partition_sizes: list[int]) -> torch.Tensor:
-        in_features = codes.size(1) * 8
-        out_features = codes.size(0)
-        return torch.empty((out_features, in_features),
-                           dtype=codebooks.dtype,
-                           device=codebooks.device)
    @register_fake("_C::machete_mm")
    def machete_mm_fake(
        a: torch.Tensor,
@@ -957,21 +931,6 @@ def cutlass_fp4_moe_mm(out_tensors: torch.Tensor, a_tensors: torch.Tensor,
                                             sf_offsets)
-# aqlm
-def aqlm_gemm(input: torch.Tensor, codes: torch.Tensor,
-              codebooks: torch.Tensor, scales: torch.Tensor,
-              codebook_partition_sizes: list[int],
-              bias: Optional[torch.Tensor]) -> torch.Tensor:
-    return torch.ops._C.aqlm_gemm(input, codes, codebooks, scales,
-                                  codebook_partition_sizes, bias)
-def aqlm_dequant(codes: torch.Tensor, codebooks: torch.Tensor,
-                 codebook_partition_sizes: list[int]) -> torch.Tensor:
-    return torch.ops._C.aqlm_dequant(codes, codebooks,
-                                     codebook_partition_sizes)
 # gptq_marlin
 def gptq_marlin_repack(b_q_weight: torch.Tensor, perm: torch.Tensor,
                       size_k: int, size_n: int,

--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -692,8 +692,6 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
        param_data = param.data
        output_dim = getattr(param, "output_dim", None)
-        # Special case for AQLM codebooks.
-        is_metadata = getattr(param, "is_metadata", False)
        # Special case for per-tensor scale to load scalar into fused array.
        needs_scalar_to_array = getattr(param, "needs_scalar_to_array", False)
@@ -781,13 +779,6 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
            if not is_sharded_weight:
                loaded_weight = loaded_weight.narrow(output_dim, start_idx,
                                                     shard_size)
-        # Special case for AQLM codebooks.
-        elif is_metadata:
-            # metadata indicates fixed size concatenated along dim 0
-            shard_size = loaded_weight.shape[0]
-            shard_offset = loaded_shard_id * shard_size
-            param_data = param_data.narrow(0, shard_offset, shard_size)
        # Special case for per-tensor scales in fused case.
        elif needs_scalar_to_array:
            param_data, loaded_weight = adjust_scalar_to_fused_array(
@@ -1081,8 +1072,6 @@ class QKVParallelLinear(ColumnParallelLinear):
        param_data = param.data
        output_dim = getattr(param, "output_dim", None)
-        # Special case for AQLM codebooks.
-        is_metadata = getattr(param, "is_metadata", False)
        # Special case for per-tensor scales in fused case.
        needs_scalar_to_array = getattr(param, "needs_scalar_to_array", False)
@@ -1204,13 +1193,6 @@ class QKVParallelLinear(ColumnParallelLinear):
                loaded_weight = loaded_weight.narrow(output_dim, start_idx,
                                                     shard_size)
-        # Special case for for AQLM codebooks.
-        elif is_metadata:
-            # metadata indicates fixed size concatenated along dim 0
-            shard_size = loaded_weight.shape[0]
-            shard_index = ["q", "k", "v"].index(loaded_shard_id)
-            param_data = param_data.narrow(0, shard_index * shard_size,
-                                           shard_size)
        # Special case for per-tensor scales in fused case.
        elif needs_scalar_to_array:
            param_data, loaded_weight = adjust_scalar_to_fused_array(

--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -7,7 +7,6 @@ from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig)
 QuantizationMethods = Literal[
-    "aqlm",
    "awq",
    "deepspeedfp",
    "tpu_int8",
@@ -88,7 +87,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
    # lazy import to avoid triggering `torch.compile` too early
    from vllm.model_executor.layers.quantization.quark.quark import QuarkConfig
-    from .aqlm import AQLMConfig
    from .auto_round import AutoRoundConfig
    from .awq import AWQConfig
    from .awq_marlin import AWQMarlinConfig
@@ -120,7 +118,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
    from .tpu_int8 import Int8TpuConfig
    method_to_config: dict[str, type[QuantizationConfig]] = {
-        "aqlm": AQLMConfig,
        "awq": AWQConfig,
        "deepspeedfp": DeepSpeedFPConfig,
        "tpu_int8": Int8TpuConfig,

--- a/vllm/model_executor/layers/quantization/aqlm.py
+++ b/vllm/model_executor/layers/quantization/aqlm.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# Supports AQLM compression, see https://github.com/Vahe1994/AQLM
-# and https://arxiv.org/pdf/2401.06118.pdf
-import math
-from typing import Any, Optional
-import torch
-import torch.nn.functional as F
-from torch.nn.parameter import Parameter
-from vllm import _custom_ops as ops
-from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
-from vllm.model_executor.layers.quantization import QuantizationMethods
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
-from vllm.model_executor.utils import set_weight_attrs
-def get_int_dtype(nbits: int) -> torch.dtype:
-    if nbits <= 8:
-        return torch.int8
-    if nbits <= 16:
-        return torch.int16
-    if nbits <= 32:
-        return torch.int32
-    if nbits <= 64:
-        return torch.int64
-    raise ValueError(f"No dtype available for {nbits}-bit codebooks")
-@torch.inference_mode()
-def unpack_int_data(data: torch.IntTensor, nbits: int) -> torch.IntTensor:
-    return data.to(torch.int64) % (2**nbits)
-def dequantize_weight(codes: torch.Tensor,
-                      codebooks: torch.Tensor,
-                      scales: Optional[torch.Tensor] = None) -> torch.Tensor:
-    """
-    Decode float weights from quantization codes. Differentiable.
-    :param codes: tensor of integer quantization codes, shape 
-        [*dims, num_out_groups, num_in_groups, num_codebooks]
-    :param codebooks: tensor of vectors for each quantization code, 
-        [num_codebooks, codebook_size, out_group_size, in_group_size]
-    :param scales: weight will be multiplied by this factor, must be 
-        broadcastble with 
-        [*dims, out_groups, num_in_groups, out_group_size, in_group_size]
-    :return: reconstructed weight tensor of shape 
-        [*dims, num_in_groups*group_size]
-    """
-    num_out_groups, num_in_groups, num_codebooks = codes.shape[-3:]
-    num_codebooks, codebook_size, out_group_size, in_group_size = \
-        codebooks.shape
-    out_features = num_out_groups * out_group_size
-    in_features = num_in_groups * in_group_size
-    codebook_offsets = torch.arange(
-        0, num_codebooks * codebook_size, codebook_size,
-        device=codes.device)  # shape: [num_codebooks]
-    reconstructed_weight_flat = F.embedding_bag(
-        codes.flatten(0, -2) + codebook_offsets,
-        codebooks.flatten(0, 1).flatten(-2, -1),
-        mode="sum"
-    )  # [prod(dims) * num_out_groups * num_in_groups, out_group_size
-    # * in_group_size]
-    reconstructed_weight_groupwise = reconstructed_weight_flat.view(
-        list(codes.shape[:-3]) +
-        [num_out_groups, num_in_groups, out_group_size, in_group_size])
-    if scales is not None:
-        reconstructed_weight_groupwise = reconstructed_weight_groupwise.mul(
-            scales)
-    return reconstructed_weight_groupwise.swapaxes(
-        -3, -2).reshape(list(codes.shape[:-3]) + [out_features, in_features])
-def dequantize_gemm(
-    input: torch.Tensor,  #  [..., in_features]
-    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
-    codebooks: torch.
-    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
-    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
-    bias: Optional[torch.Tensor],
-) -> torch.Tensor:
-    dequantized_weight = dequantize_weight(
-        unpack_int_data(codes, codebooks.shape[1].bit_length() - 1),
-        codebooks,
-        scales,
-    )
-    return F.linear(input, dequantized_weight, bias)
-# Generic dequantization, slow but flexible.
-def generic_dequantize_gemm(
-    input: torch.Tensor,  #  [..., in_features]
-    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
-    codebooks: torch.
-    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
-    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
-    output_partition_sizes: list[int],
-    bias: Optional[torch.Tensor],
-) -> torch.Tensor:
-    output_shape = input.shape[:-1] + (scales.shape[0], )
-    output = torch.empty(output_shape, dtype=input.dtype, device=input.device)
-    num_outputs = len(output_partition_sizes)
-    # break the inputs and codebooks apart then combine the outputs.
-    # Surprisingly (to me) this is faster than doing 3 de-quants and 1 big
-    # multiply at the end.
-    num_codebooks = codebooks.shape[0] // num_outputs
-    assert (scales.shape[0] == codes.shape[0])
-    assert (sum(output_partition_sizes) == scales.shape[0])
-    output_offset = 0
-    codebooks_offset = 0
-    for output_size in output_partition_sizes:
-        shard_output = dequantize_gemm(
-            input, codes.narrow(0, output_offset, output_size),
-            codebooks.narrow(0, codebooks_offset, num_codebooks),
-            scales.narrow(0, output_offset, output_size), None
-            if bias is None else bias.narrow(0, output_offset, output_size))
-        output_slice = output.narrow(-1, output_offset, output_size)
-        assert (output_slice.shape == shard_output.shape)
-        output_slice.copy_(shard_output)
-        output_offset += output_size
-        codebooks_offset += num_codebooks
-    return output
-# Optimized dequnantize/decompression kernels, supports 1x16 and 2x8
-# at 6 and 9 times faster than the generic version above, respectively.
-def optimized_dequantize_gemm(
-    input: torch.Tensor,  #  [..., in_features]
-    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
-    codebooks: torch.
-    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
-    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
-    output_partition_sizes: list[int],
-    bias: Optional[torch.Tensor],
-) -> torch.Tensor:
-    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
-    if bias is None:
-        # scaling the output is fastest, so we do that when possible.
-        output = F.linear(input, weights, bias)
-        orig_shape = output.shape
-        flattened_output = output.view(-1, output.size(-1))
-        f_scales = scales.view(-1, scales.shape[0])
-        b_scales = f_scales.expand(flattened_output.shape[0], -1)
-        flattened_output *= b_scales
-        return output.view(orig_shape)
-    else:
-        b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
-            -1, weights.shape[1])
-        weights *= b_scales
-        return F.linear(input, weights, bias)
-class AQLMConfig(QuantizationConfig):
-    """Config class for AQLM.
-    Reference: https://github.com/Vahe1994/AQLM
-    """
-    def __init__(
-        self,
-        in_group_size: int,
-        nbits_per_codebook: int,
-        num_codebooks: int,
-        out_group_size: int,
-    ) -> None:
-        super().__init__()
-        self.in_group_size = in_group_size
-        self.nbits_per_codebook = nbits_per_codebook
-        self.num_codebooks = num_codebooks
-        self.out_group_size = out_group_size
-        # out_group_size > 1 is untested, and probably won't work as-is.
-        assert (self.out_group_size == 1)
-        self.pack_factor = (self.in_group_size * self.out_group_size)
-    def __repr__(self) -> str:
-        return (f"AQLMConfig(in_group_size={self.in_group_size}, "
-                f"nbits_per_codebook={self.nbits_per_codebook}, "
-                f"num_codebooks={self.num_codebooks}, "
-                f"out_group_size={self.out_group_size})")
-    @classmethod
-    def get_name(cls) -> QuantizationMethods:
-        return "aqlm"
-    @classmethod
-    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
-        return [torch.half]
-    @classmethod
-    def get_min_capability(cls) -> int:
-        return 60
-    @classmethod
-    def get_config_filenames(cls) -> list[str]:
-        return []  # no extra configs.
-    @classmethod
-    def from_config(cls, config: dict[str, Any]) -> "AQLMConfig":
-        in_group_size = cls.get_from_keys(config, ["in_group_size"])
-        nbits_per_codebook = cls.get_from_keys(config, ["nbits_per_codebook"])
-        num_code_books = cls.get_from_keys(config, ["num_codebooks"])
-        out_group_size = cls.get_from_keys(config, ["out_group_size"])
-        return cls(in_group_size, nbits_per_codebook, num_code_books,
-                   out_group_size)
-    def get_quant_method(self, layer: torch.nn.Module,
-                         prefix: str) -> Optional["AQLMLinearMethod"]:
-        if isinstance(layer, LinearBase):
-            return AQLMLinearMethod(self)
-        return None
-class AQLMLinearMethod(LinearMethodBase):
-    """Linear method for AQLM.
-    Args:
-        quant_config: The AQLM quantization config.
-    """
-    def __init__(self, quant_config: AQLMConfig):
-        self.quant_config = quant_config
-    def create_weights(self, layer: torch.nn.Module,
-                       input_size_per_partition: int,
-                       output_partition_sizes: list[int], input_size: int,
-                       output_size: int, params_dtype: torch.dtype,
-                       **extra_weight_attrs):
-        del output_size  # Unused.
-        del input_size  # Unused.
-        if params_dtype != torch.half:
-            raise ValueError("Only half is currently supported by aqlm")
-        if input_size_per_partition % self.quant_config.in_group_size != 0:
-            raise ValueError(
-                "The input size is not aligned with the quantized "
-                "weight shape. This can be caused by too large "
-                "tensor parallel size.")
-        output_size_per_partition = sum(output_partition_sizes)
-        if output_size_per_partition % self.quant_config.out_group_size != 0:
-            raise ValueError(
-                "The output size is not aligned with the quantized "
-                "weight shape. This can be caused by too large "
-                "tensor parallel size.")
-        codes = Parameter(
-            torch.empty(
-                # There could actually be two pack factors, one along input and
-                # one along output, but we don't currently support
-                # out_group_size, and only the one along output needs to be
-                # marked with "packed_dim" in order for QKVLinear to work.
-                output_size_per_partition,
-                input_size_per_partition // self.quant_config.pack_factor,
-                self.quant_config.num_codebooks,
-                dtype=get_int_dtype(self.quant_config.nbits_per_codebook),
-            ),
-            requires_grad=False,
-        )
-        set_weight_attrs(
-            codes,
-            {
-                "input_dim": 1,
-                "output_dim": 0,
-                "packed_dim": 1,
-                "pack_factor": self.quant_config.pack_factor,
-            },
-        )
-        codebooks = Parameter(
-            torch.empty(
-                self.quant_config.num_codebooks * len(output_partition_sizes),
-                2**self.quant_config.nbits_per_codebook,
-                self.quant_config.out_group_size,
-                self.quant_config.in_group_size,
-                dtype=params_dtype,
-            ),
-            requires_grad=False,
-        )
-        set_weight_attrs(
-            codebooks,
-            {
-                # metadata indicates fixed size concatenated along dim 0
-                "is_metadata": True,
-                "output_partition_sizes": output_partition_sizes
-            },
-        )
-        scales = Parameter(
-            torch.empty(
-                (
-                    output_size_per_partition //
-                    self.quant_config.out_group_size,
-                    1,
-                    1,
-                    1,
-                ),
-                dtype=params_dtype,
-            ),
-            requires_grad=False,
-        )
-        set_weight_attrs(
-            scales,
-            {
-                "output_dim": 0,
-                "packed_dim": 0,
-                "pack_factor": self.quant_config.out_group_size
-            },
-        )
-        layer.register_parameter("codes", codes)
-        set_weight_attrs(codes, extra_weight_attrs)
-        layer.register_parameter("codebooks", codebooks)
-        set_weight_attrs(codebooks, extra_weight_attrs)
-        layer.register_parameter("scales", scales)
-        set_weight_attrs(scales, extra_weight_attrs)
-    def apply(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        codebooks = layer.codebooks
-        codes = layer.codes
-        scales = layer.scales
-        output_partition_sizes = getattr(codebooks, "output_partition_sizes",
-                                         [])
-        nbooks = codes.shape[2]
-        ingroups = codebooks.shape[3]
-        outgroups = codebooks.shape[2]
-        bits = codebooks.shape[1]
-        # We support these formats with dedicated gemm and decompression
-        # kernels.
-        if ingroups == 8 and outgroups == 1 and (
-            (bits == 256 and nbooks == 2) or (bits == 65536 and nbooks == 1)):
-            # thresholds determined by timings on an A6000, one GPU
-            use_gemv = math.prod(x.shape[:-1]) <= 6
-            return ops.aqlm_gemm(
-                x,
-                codes,
-                codebooks,
-                scales,
-                output_partition_sizes,
-                bias,
-            ) if use_gemv else optimized_dequantize_gemm(
-                x,
-                codes,
-                codebooks,
-                scales,
-                output_partition_sizes,
-                bias,
-            )
-        # fall back all unoptimized formats
-        return generic_dequantize_gemm(
-            x,
-            codes,
-            codebooks,
-            scales,
-            output_partition_sizes,
-            bias,
-        )