[Kernel] Marlin Expansion: Support AutoGPTQ Models with Marlin (#3922)

Co-authored-by: alexm <alexm@neuralmagic.com> Co-authored-by: mgoin <michael@neuralmagic.com>

[Kernel] Marlin Expansion: Support AutoGPTQ Models with Marlin (#3922)
Co-authored-by: alexm <alexm@neuralmagic.com> Co-authored-by: mgoin <michael@neuralmagic.com>
73c8d677 · Robert Shaw · GitHub · df29793d · 73c8d677 · 73c8d677
Unverified Commit 73c8d677 authored Apr 29, 2024 by Robert Shaw Committed by GitHub Apr 29, 2024
14 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -177,6 +177,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    "csrc/quantization/aqlm/gemm_kernels.cu"
    "csrc/quantization/awq/gemm_kernels.cu"
    "csrc/quantization/marlin/marlin_cuda_kernel.cu"
+    "csrc/quantization/gptq_marlin/gptq_marlin.cu"
+    "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
    "csrc/custom_all_reduce.cu")
 endif()

--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -124,6 +124,24 @@ torch::Tensor marlin_gemm(
    int64_t size_m, 
    int64_t size_n, 
    int64_t size_k);
+torch::Tensor gptq_marlin_gemm(
+  torch::Tensor &a,
+  torch::Tensor &b_q_weight,
+  torch::Tensor &b_scales,
+  torch::Tensor &g_idx,
+  torch::Tensor &perm,
+  torch::Tensor &workspace,
+  int64_t size_m,
+  int64_t size_n,
+  int64_t size_k,
+  bool is_k_full);
+torch::Tensor gptq_marlin_repack(
+  torch::Tensor &b_q_weight,
+  torch::Tensor &perm,
+  int64_t size_k,
+  int64_t size_n);
 #endif
 void squeezellm_gemm(

--- a/csrc/pybind.cpp
+++ b/csrc/pybind.cpp
@@ -67,6 +67,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  ops.def("aqlm_dequant", &aqlm_dequant, "Decompression method for AQLM");
  ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ");
  ops.def("marlin_gemm", &marlin_gemm, "Marlin Optimized Quantized GEMM for GPTQ");
+  ops.def("gptq_marlin_gemm", &gptq_marlin_gemm, "gptq_marlin Optimized Quantized GEMM for GPTQ");
+  ops.def("gptq_marlin_repack", &gptq_marlin_repack, "gptq_marlin repack from GPTQ");
  ops.def("awq_dequantize", &awq_dequantize, "Dequantization for AWQ");
 #endif

--- a/csrc/quantization/gptq_marlin/gptq_marlin.cu
+++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu
+/*
+ * Modified by Neural Magic
+ * Copyright (C) Marlin.2024 Elias Frantar
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *         http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Adapted from https://github.com/IST-DASLab/marlin
+ */
+#include "gptq_marlin.cuh"
+template <typename T> inline std::string str(T x) { return std::to_string(x); }
+namespace gptq_marlin {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+__global__ void permute_cols_kernel(int4 const *__restrict__ a_int4_ptr,
+                                    int const *__restrict__ perm_int_ptr,
+                                    int4 *__restrict__ out_int4_ptr, int size_m,
+                                    int size_k, int block_rows) {}
+template <const int threads,         // number of threads in a threadblock
+          const int thread_m_blocks, // number of 16x16 blocks in the m
+                                     // dimension (batchsize) of the threadblock
+          const int thread_n_blocks, // same for n dimension (output)
+          const int thread_k_blocks, // same for k dimension (reduction)
+          const int stages, // number of stages for the async global->shared
+                            // fetch pipeline
+          const bool has_act_order,   // whether act_order is enabled
+          const int group_blocks = -1 // number of consecutive 16x16 blocks with
+                                      // a separate quantization scale
+          >
+__global__ void
+Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk
+       const int4 *__restrict__ B, // 4bit quantized weight matrix of shape kxn
+       int4 *__restrict__ C,       // fp16 output buffer of shape mxn
+       const int4 *__restrict__ scales_ptr, // fp16 quantization scales of shape
+                                            // (k/groupsize)xn
+       const int *__restrict__ g_idx,       // int32 group indices of shape k
+       int num_groups, // number of scale groups per output channel
+       int prob_m,     // batch dimension m
+       int prob_n,     // output dimension n
+       int prob_k,     // reduction dimension k
+       int *locks      // extra global storage for barrier synchronization
+) {}
+} // namespace gptq_marlin
+torch::Tensor gptq_marlin_gemm(torch::Tensor &a, torch::Tensor &b_q_weight,
+                               torch::Tensor &b_scales, torch::Tensor &g_idx,
+                               torch::Tensor &perm, torch::Tensor &workspace,
+                               int64_t size_m, int64_t size_n, int64_t size_k,
+                               bool is_k_full) {
+  TORCH_CHECK_NOT_IMPLEMENTED(false,
+                              "marlin_gemm(..) requires CUDA_ARCH >= 8.0");
+  return torch::empty({1, 1});
+}
+#else
+// Matrix fragments for tensor core instructions; their precise layout is
+// documented here:
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type
+using FragA = Vec<half2, 4>;
+using FragB = Vec<half2, 2>;
+using FragC = Vec<float, 4>;
+using FragS = Vec<half2, 1>; // quantization scales
+// m16n8k16 tensor core mma instruction with fp16 inputs and fp32
+// output/accumulation.
+__device__ inline void mma(const FragA &a_frag, const FragB &frag_b,
+                           FragC &frag_c) {
+  const uint32_t *a = reinterpret_cast<const uint32_t *>(&a_frag);
+  const uint32_t *b = reinterpret_cast<const uint32_t *>(&frag_b);
+  float *c = reinterpret_cast<float *>(&frag_c);
+  asm volatile("mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
+               "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+               : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+               : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]),
+                 "r"(b[1]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
+}
+// Instruction for loading a full 16x16 matrix fragment of operand A from shared
+// memory, directly in tensor core layout.
+__device__ inline void ldsm4(FragA &frag_a, const void *smem_ptr) {
+  uint32_t *a = reinterpret_cast<uint32_t *>(&frag_a);
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n"
+               : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3])
+               : "r"(smem));
+}
+// Lookup-table based 3-input logical operation; explicitly used for
+// dequantization as the compiler does not seem to automatically recognize it in
+// all cases.
+template <int lut> __device__ inline int lop3(int a, int b, int c) {
+  int res;
+  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
+               : "=r"(res)
+               : "r"(a), "r"(b), "r"(c), "n"(lut));
+  return res;
+}
+// Efficiently dequantize an int32 value into a full B-fragment of 4 fp16
+// values. We mostly follow the strategy in the link below, with some small
+// changes:
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
+__device__ inline FragB dequant(int q) {
+  const int LO = 0x000f000f;
+  const int HI = 0x00f000f0;
+  const int EX = 0x64006400;
+  // Guarantee that the `(a & b) | c` operations are LOP3s.
+  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
+  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+  // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
+  // directly into `SUB` and `ADD`.
+  const int SUB = 0x64086408;
+  const int MUL = 0x2c002c00;
+  const int ADD = 0xd480d480;
+  FragB frag_b;
+  frag_b[0] = __hsub2(*reinterpret_cast<half2 *>(&lo),
+                      *reinterpret_cast<const half2 *>(&SUB));
+  frag_b[1] = __hfma2(*reinterpret_cast<half2 *>(&hi),
+                      *reinterpret_cast<const half2 *>(&MUL),
+                      *reinterpret_cast<const half2 *>(&ADD));
+  return frag_b;
+}
+// Multiply dequantized values by the corresponding quantization scale; used
+// only for grouped quantization.
+__device__ inline void scale(FragB &frag_b, FragS &frag_s, int i) {
+  half2 s = __half2half2(reinterpret_cast<__half *>(&frag_s)[i]);
+  frag_b[0] = __hmul2(frag_b[0], s);
+  frag_b[1] = __hmul2(frag_b[1], s);
+}
+// Same as above, but for act_order (each K is multiplied individually)
+__device__ inline void scale4(FragB &frag_b, FragS &frag_s_1, FragS &frag_s_2,
+                              FragS &frag_s_3, FragS &frag_s_4, int i) {
+  __half2 s_val_1_2;
+  s_val_1_2.x = reinterpret_cast<__half *>(&frag_s_1)[i];
+  s_val_1_2.y = reinterpret_cast<__half *>(&frag_s_2)[i];
+  __half2 s_val_3_4;
+  s_val_3_4.x = reinterpret_cast<__half *>(&frag_s_3)[i];
+  s_val_3_4.y = reinterpret_cast<__half *>(&frag_s_4)[i];
+  frag_b[0] = __hmul2(frag_b[0], s_val_1_2);
+  frag_b[1] = __hmul2(frag_b[1], s_val_3_4);
+}
+// Wait until barrier reaches `count`, then lock for current threadblock.
+__device__ inline void barrier_acquire(int *lock, int count) {
+  if (threadIdx.x == 0) {
+    int state = -1;
+    do
+      // Guarantee that subsequent writes by this threadblock will be visible
+      // globally.
+      asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n"
+                   : "=r"(state)
+                   : "l"(lock));
+    while (state != count);
+  }
+  __syncthreads();
+}
+// Release barrier and increment visitation count.
+__device__ inline void barrier_release(int *lock, bool reset = false) {
+  __syncthreads();
+  if (threadIdx.x == 0) {
+    if (reset) {
+      lock[0] = 0;
+      return;
+    }
+    int val = 1;
+    // Make sure that all writes since acquiring this barrier are visible
+    // globally, while releasing the barrier.
+    asm volatile("fence.acq_rel.gpu;\n");
+    asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n"
+                 :
+                 : "l"(lock), "r"(val));
+  }
+}
+// For a given "a" of size [M,K] performs a permutation of the K columns based
+// on the given "perm" indices.
+__global__ void permute_cols_kernel(int4 const *__restrict__ a_int4_ptr,
+                                    int const *__restrict__ perm_int_ptr,
+                                    int4 *__restrict__ out_int4_ptr, int size_m,
+                                    int size_k, int block_rows) {
+  int start_row = block_rows * blockIdx.x;
+  int finish_row = start_row + block_rows;
+  if (finish_row > size_m) {
+    finish_row = size_m;
+  }
+  int cur_block_rows = finish_row - start_row;
+  int row_stride = size_k * sizeof(half) / 16;
+  auto permute_row = [&](int row) {
+    int iters = size_k / default_threads;
+    int rest = size_k % default_threads;
+    int offset = row * row_stride;
+    half const *a_row_half =
+        reinterpret_cast<half const *>(a_int4_ptr + offset);
+    half *out_half = reinterpret_cast<half *>(out_int4_ptr + offset);
+    int base_k = 0;
+    for (int i = 0; i < iters; i++) {
+      int cur_k = base_k + threadIdx.x;
+      int src_pos = perm_int_ptr[cur_k];
+      out_half[cur_k] = a_row_half[src_pos];
+      base_k += default_threads;
+    }
+    if (rest) {
+      if (threadIdx.x < rest) {
+        int cur_k = base_k + threadIdx.x;
+        int src_pos = perm_int_ptr[cur_k];
+        out_half[cur_k] = a_row_half[src_pos];
+      }
+    }
+  };
+  for (int i = 0; i < cur_block_rows; i++) {
+    int cur_row = start_row + i;
+    if (cur_row < size_m) {
+      permute_row(cur_row);
+    }
+  }
+}
+template <const int threads,         // number of threads in a threadblock
+          const int thread_m_blocks, // number of 16x16 blocks in the m
+                                     // dimension (batchsize) of the threadblock
+          const int thread_n_blocks, // same for n dimension (output)
+          const int thread_k_blocks, // same for k dimension (reduction)
+          const int stages, // number of stages for the async global->shared
+                            // fetch pipeline
+          const bool has_act_order,   // whether act_order is enabled
+          const int group_blocks = -1 // number of consecutive 16x16 blocks with
+                                      // a separate quantization scale
+          >
+__global__ void
+Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk
+       const int4 *__restrict__ B, // 4bit quantized weight matrix of shape kxn
+       int4 *__restrict__ C,       // fp16 output buffer of shape mxn
+       const int4 *__restrict__ scales_ptr, // fp16 quantization scales of shape
+                                            // (k/groupsize)xn
+       const int *__restrict__ g_idx,       // int32 group indices of shape k
+       int num_groups, // number of scale groups per output channel
+       int prob_m,     // batch dimension m
+       int prob_n,     // output dimension n
+       int prob_k,     // reduction dimension k
+       int *locks      // extra global storage for barrier synchronization
+) {
+  // Each threadblock processes one "stripe" of the B matrix with (roughly) the
+  // same size, which might involve multiple column "slices" (of width 16 *
+  // `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM
+  // example:
+  //   0 1 3
+  //   0 2 3
+  //   1 2 4
+  // While this kind of partitioning makes things somewhat more complicated, it
+  // ensures good utilization of all SMs for many kinds of shape and GPU
+  // configurations, while requiring as few slow global cross-threadblock
+  // reductions as possible.
+  // For larger GEMMs we run multiple batchsize 64 versions in parallel for a
+  // better partitioning with less reductions
+  int parallel = 1;
+  if (prob_m > 16 * thread_m_blocks) {
+    parallel = prob_m / (16 * thread_m_blocks);
+    prob_m = 16 * thread_m_blocks;
+  }
+  int k_tiles = prob_k / 16 / thread_k_blocks;
+  int n_tiles = prob_n / 16 / thread_n_blocks;
+  int iters = div_ceil(k_tiles * n_tiles * parallel, gridDim.x);
+  if constexpr (!has_act_order && group_blocks != -1) {
+    if (group_blocks >= thread_k_blocks) {
+      // Ensure that the number of tiles in each stripe is a multiple of the
+      // groupsize; this avoids an annoying special case where a stripe starts
+      // in the middle of group.
+      iters = (group_blocks / thread_k_blocks) *
+              div_ceil(iters, (group_blocks / thread_k_blocks));
+    }
+  }
+  int slice_row = (iters * blockIdx.x) % k_tiles;
+  int slice_col_par = (iters * blockIdx.x) / k_tiles;
+  int slice_col = slice_col_par;
+  int slice_iters; // number of threadblock tiles in the current slice
+  int slice_count =
+      0;         // total number of active threadblocks in the current slice
+  int slice_idx; // index of threadblock in current slice; numbered bottom to
+                 // top
+  // We can easily implement parallel problem execution by just remapping
+  // indices and advancing global pointers
+  if (slice_col_par >= n_tiles) {
+    A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_k / 8;
+    C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8;
+    locks += (slice_col_par / n_tiles) * n_tiles;
+    slice_col = slice_col_par % n_tiles;
+  }
+  // Compute all information about the current slice which is required for
+  // synchronization.
+  auto init_slice = [&]() {
+    slice_iters =
+        iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row);
+    if (slice_iters < 0 || slice_col_par >= n_tiles * parallel)
+      slice_iters = 0;
+    if (slice_iters == 0)
+      return;
+    if (slice_row + slice_iters > k_tiles)
+      slice_iters = k_tiles - slice_row;
+    slice_count = 1;
+    slice_idx = 0;
+    int col_first = iters * div_ceil(k_tiles * slice_col_par, iters);
+    if (col_first <= k_tiles * (slice_col_par + 1)) {
+      int col_off = col_first - k_tiles * slice_col_par;
+      slice_count = div_ceil(k_tiles - col_off, iters);
+      if (col_off > 0)
+        slice_count++;
+      int delta_first = iters * blockIdx.x - col_first;
+      if (delta_first < 0 || (col_off == 0 && delta_first == 0))
+        slice_idx = slice_count - 1;
+      else {
+        slice_idx = slice_count - 1 - delta_first / iters;
+        if (col_off > 0)
+          slice_idx--;
+      }
+    }
+    if (slice_col == n_tiles) {
+      A += 16 * thread_m_blocks * prob_k / 8;
+      C += 16 * thread_m_blocks * prob_n / 8;
+      locks += n_tiles;
+      slice_col = 0;
+    }
+  };
+  init_slice();
+  // A sizes/strides
+  // stride of the A matrix in global memory
+  int a_gl_stride = prob_k / 8;
+  // stride of an A matrix tile in shared memory
+  constexpr int a_sh_stride = 16 * thread_k_blocks / 8;
+  // delta between subsequent A tiles in global memory
+  constexpr int a_gl_rd_delta_o = 16 * thread_k_blocks / 8;
+  // between subsequent accesses within a tile
+  int a_gl_rd_delta_i = a_gl_stride * (threads / a_gl_rd_delta_o);
+  // between shared memory writes
+  constexpr int a_sh_wr_delta = a_sh_stride * (threads / a_gl_rd_delta_o);
+  // between shared memory tile reads
+  constexpr int a_sh_rd_delta_o = 2 * ((threads / 32) / (thread_n_blocks / 4));
+  // within a shared memory tile
+  constexpr int a_sh_rd_delta_i = a_sh_stride * 16;
+  // overall size of a tile
+  constexpr int a_sh_stage = a_sh_stride * (16 * thread_m_blocks);
+  // number of shared write iterations for a tile
+  constexpr int a_sh_wr_iters = div_ceil(a_sh_stage, a_sh_wr_delta);
+  // B sizes/strides
+  int b_gl_stride = 16 * prob_n / 32;
+  constexpr int b_sh_stride = 32 * thread_n_blocks / 4;
+  int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks;
+  int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride);
+  constexpr int b_sh_wr_delta = threads;
+  constexpr int b_sh_rd_delta = threads;
+  constexpr int b_sh_stage = b_sh_stride * thread_k_blocks;
+  constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta;
+  // Scale sizes/strides without act_order
+  int s_gl_stride = prob_n / 8;
+  constexpr int s_sh_stride = 16 * thread_n_blocks / 8;
+  constexpr int s_tb_groups = !has_act_order && group_blocks < thread_k_blocks
+                                  ? thread_k_blocks / group_blocks
+                                  : 1;
+  constexpr int s_sh_stage = s_tb_groups * s_sh_stride;
+  int s_gl_rd_delta = s_gl_stride;
+  // Scale size/strides with act_order
+  constexpr int tb_k = 16 * thread_k_blocks;
+  constexpr int g_idx_stage = has_act_order ? (tb_k * sizeof(int)) / 16 : 0;
+  // constexpr int act_s_row_stride      = 1;
+  // int           act_s_col_stride      = act_s_row_stride * num_groups;
+  int act_s_col_stride = 1;
+  int act_s_col_warp_stride = act_s_col_stride * 8;
+  int tb_n_warps = thread_n_blocks / 4;
+  int act_s_col_tb_stride = act_s_col_warp_stride * tb_n_warps;
+  // Global A read index of current thread.
+  int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) +
+                (threadIdx.x % a_gl_rd_delta_o);
+  a_gl_rd += a_gl_rd_delta_o * slice_row;
+  // Shared write index of current thread.
+  int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) +
+                (threadIdx.x % a_gl_rd_delta_o);
+  // Shared read index.
+  int a_sh_rd =
+      a_sh_stride * ((threadIdx.x % 32) % 16) + (threadIdx.x % 32) / 16;
+  a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4));
+  int b_gl_rd =
+      b_gl_stride * (threadIdx.x / b_sh_stride) + (threadIdx.x % b_sh_stride);
+  b_gl_rd += b_sh_stride * slice_col;
+  b_gl_rd += b_gl_rd_delta_o * slice_row;
+  int b_sh_wr = threadIdx.x;
+  int b_sh_rd = threadIdx.x;
+  // For act_order
+  constexpr int k_iter_size = tb_k / b_sh_wr_iters;
+  int slice_k_start = tb_k * slice_row;
+  int slice_k_finish = slice_k_start + tb_k * slice_iters;
+  int slice_k_start_shared_fetch = slice_k_start;
+  int slice_n_offset = act_s_col_tb_stride * slice_col;
+  // No act_order
+  int s_gl_rd;
+  if constexpr (!has_act_order) {
+    s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) +
+              s_sh_stride * slice_col + threadIdx.x;
+  }
+  int s_sh_wr = threadIdx.x;
+  bool s_sh_wr_pred = threadIdx.x < s_sh_stride;
+  // We use a different scale layout for grouped and column-wise quantization as
+  // we scale a `half2` tile in column-major layout in the former and in
+  // row-major in the latter case.
+  int s_sh_rd;
+  if constexpr (group_blocks != -1)
+    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
+              (threadIdx.x % 32) / 4;
+  else
+    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
+              (threadIdx.x % 32) % 4;
+  // Precompute which thread should not read memory in which iterations; this is
+  // needed if there are more threads than required for a certain tilesize or
+  // when the batchsize is not a multiple of 16.
+  bool a_sh_wr_pred[a_sh_wr_iters];
+#pragma unroll
+  for (int i = 0; i < a_sh_wr_iters; i++)
+    a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m;
+  // To ensure that writing and reading A tiles to/from shared memory, the
+  // latter in fragment format, is fully bank conflict free, we need to use a
+  // rather fancy XOR-based layout. The key here is that neither reads nor
+  // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the
+  // same shared memory banks. Further, it seems (based on NSight-Compute) that
+  // each warp must also write a consecutive memory segment?
+  auto transform_a = [&](int i) {
+    int row = i / a_gl_rd_delta_o;
+    return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row;
+  };
+  // Since the computation of this remapping is non-trivial and, due to our main
+  // loop unrolls, all shared memory accesses are static, we simply precompute
+  // both transformed reads and writes.
+  int a_sh_wr_trans[a_sh_wr_iters];
+#pragma unroll
+  for (int i = 0; i < a_sh_wr_iters; i++)
+    a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr);
+  int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks];
+#pragma unroll
+  for (int i = 0; i < b_sh_wr_iters; i++) {
+#pragma unroll
+    for (int j = 0; j < thread_m_blocks; j++)
+      a_sh_rd_trans[i][j] =
+          transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd);
+  }
+  // Since B-accesses have non-constant stride they have to be computed at
+  // runtime; we break dependencies between subsequent accesses with a tile by
+  // maintining multiple pointers (we have enough registers), a tiny
+  // optimization.
+  const int4 *B_ptr[b_sh_wr_iters];
+#pragma unroll
+  for (int i = 0; i < b_sh_wr_iters; i++)
+    B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd;
+  extern __shared__ int4 sh[];
+  // Shared memory storage for global fetch pipelines.
+  int4 *sh_a = sh;
+  int4 *sh_b = sh_a + (stages * a_sh_stage);
+  int4 *sh_g_idx = sh_b + (stages * b_sh_stage);
+  int4 *sh_s = sh_g_idx + (stages * g_idx_stage);
+  // Register storage for double buffer of shared memory reads.
+  FragA frag_a[2][thread_m_blocks];
+  I4 frag_b_quant[2];
+  FragC frag_c[thread_m_blocks][4][2];
+  FragS frag_s[2][4];        // No act-order
+  FragS act_frag_s[2][4][4]; // For act-order
+  // Zero accumulators.
+  auto zero_accums = [&]() {
+#pragma unroll
+    for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++)
+      reinterpret_cast<float *>(frag_c)[i] = 0;
+  };
+  int sh_first_group_id = -1;
+  int sh_num_groups = -1;
+  constexpr int sh_max_num_groups = 32;
+  auto fetch_scales_to_shared = [&](bool is_async, int first_group_id,
+                                    int last_group_id) {
+    sh_first_group_id = first_group_id;
+    sh_num_groups = last_group_id - first_group_id + 1;
+    if (sh_num_groups < sh_max_num_groups) {
+      sh_num_groups = sh_max_num_groups;
+    }
+    if (sh_first_group_id + sh_num_groups > num_groups) {
+      sh_num_groups = num_groups - sh_first_group_id;
+    }
+    int row_offset = first_group_id * s_gl_stride;
+    if (is_async) {
+      for (int i = 0; i < sh_num_groups; i++) {
+        if (threadIdx.x < s_sh_stride) {
+          cp_async4_pred(&sh_s[(i * s_sh_stride) + threadIdx.x],
+                         &scales_ptr[row_offset + (i * s_gl_stride) +
+                                     slice_n_offset + threadIdx.x]);
+        }
+      }
+    } else {
+      for (int i = 0; i < sh_num_groups; i++) {
+        if (threadIdx.x < s_sh_stride) {
+          sh_s[(i * s_sh_stride) + threadIdx.x] =
+              scales_ptr[row_offset + (i * s_gl_stride) + slice_n_offset +
+                         threadIdx.x];
+        }
+      }
+    }
+  };
+  // Asynchronously fetch the next A, B and s tile from global to the next
+  // shared memory pipeline location.
+  auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) {
+    if (pred) {
+      int4 *sh_a_stage = sh_a + a_sh_stage * pipe;
+#pragma unroll
+      for (int i = 0; i < a_sh_wr_iters; i++) {
+        cp_async4_pred(
+            &sh_a_stage[a_sh_wr_trans[i]],
+            &A[a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off],
+            a_sh_wr_pred[i]);
+      }
+      int4 *sh_b_stage = sh_b + b_sh_stage * pipe;
+#pragma unroll
+      for (int i = 0; i < b_sh_wr_iters; i++) {
+        cp_async4_stream(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]);
+        B_ptr[i] += b_gl_rd_delta_o;
+      }
+      if constexpr (has_act_order) {
+        // Fetch g_idx thread-block portion
+        int full_pipe = a_off;
+        int cur_k = slice_k_start_shared_fetch + tb_k * full_pipe;
+        if (cur_k < prob_k && cur_k < slice_k_finish) {
+          int4 *sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe;
+          int4 const *cur_g_idx_stage_ptr =
+              reinterpret_cast<int4 const *>(&g_idx[cur_k]);
+          if (threadIdx.x < g_idx_stage) {
+            cp_async4_pred(&sh_g_idx_stage[threadIdx.x],
+                           &cur_g_idx_stage_ptr[threadIdx.x]);
+          }
+        }
+      } else {
+        if constexpr (group_blocks != -1) {
+          int4 *sh_s_stage = sh_s + s_sh_stage * pipe;
+          if constexpr (group_blocks >= thread_k_blocks) {
+            // Only fetch scales if this tile starts a new group
+            if (pipe % (group_blocks / thread_k_blocks) == 0) {
+              if (s_sh_wr_pred) {
+                cp_async4_stream(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]);
+              }
+              s_gl_rd += s_gl_rd_delta;
+            }
+          } else {
+            for (int i = 0; i < s_tb_groups; i++) {
+              if (s_sh_wr_pred) {
+                cp_async4_stream(&sh_s_stage[i * s_sh_stride + s_sh_wr],
+                                 &scales_ptr[s_gl_rd]);
+              }
+              s_gl_rd += s_gl_rd_delta;
+            }
+          }
+        }
+      }
+    }
+    // Insert a fence even when we are winding down the pipeline to ensure that
+    // waiting is also correct at this point.
+    cp_async_fence();
+  };
+  // Wait until the next thread tile has been loaded to shared memory.
+  auto wait_for_stage = [&]() {
+    // We only have `stages - 2` active fetches since we are double buffering
+    // and can only issue the next fetch when it is guaranteed that the previous
+    // shared memory load is fully complete (as it may otherwise be
+    // overwritten).
+    cp_async_wait<stages - 2>();
+    __syncthreads();
+  };
+  // Load the next sub-tile from the current location in the shared memory pipe
+  // into the current register buffer.
+  auto fetch_to_registers = [&](int k, int pipe) {
+    int4 *sh_a_stage = sh_a + a_sh_stage * pipe;
+#pragma unroll
+    for (int i = 0; i < thread_m_blocks; i++)
+      ldsm4(frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]);
+    int4 *sh_b_stage = sh_b + b_sh_stage * pipe;
+    frag_b_quant[k % 2] = *reinterpret_cast<I4 *>(
+        &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd]);
+  };
+  bool is_same_group[stages];
+  int same_group_id[stages];
+  auto init_same_group = [&](int pipe) {
+    int4 *sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe;
+    int *sh_g_idx_int_ptr = reinterpret_cast<int *>(sh_g_idx_stage);
+    int group_id_1 = sh_g_idx_int_ptr[0];
+    int group_id_2 = sh_g_idx_int_ptr[tb_k - 1];
+    is_same_group[pipe] = group_id_1 == group_id_2;
+    same_group_id[pipe] = group_id_1;
+  };
+  auto fetch_scales_to_registers = [&](int k, int full_pipe) {
+    int pipe = full_pipe % stages;
+    if constexpr (!has_act_order) {
+      // No act-order case
+      if constexpr (group_blocks != -1) {
+        if constexpr (group_blocks >= thread_k_blocks) {
+          int4 *sh_s_stage =
+              sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) *
+                                   (pipe / (group_blocks / thread_k_blocks)));
+          reinterpret_cast<int4 *>(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd];
+        } else {
+          int warp_id = threadIdx.x / 32;
+          int n_warps = thread_n_blocks / 4;
+          int warp_row = warp_id / n_warps;
+          int cur_k = warp_row * 16;
+          cur_k += k_iter_size * (k % b_sh_wr_iters);
+          int k_blocks = cur_k / 16;
+          int cur_group_id = k_blocks / group_blocks;
+          int4 *sh_s_stage = sh_s + s_sh_stage * pipe;
+          reinterpret_cast<int4 *>(&frag_s[k % 2])[0] =
+              sh_s_stage[s_sh_rd + cur_group_id * s_sh_stride];
+        }
+      }
+      return;
+    }
+    // Act-order case
+    // Determine K of the "current" thread-block
+    int cur_k = slice_k_start + tb_k * full_pipe;
+    if (cur_k >= prob_k || cur_k >= slice_k_finish) {
+      return;
+    }
+    // Reset (to current thread-block) since we read g_idx portion from the
+    // shared memory
+    cur_k = 0;
+    // Progress to current iteration
+    cur_k += k_iter_size * (k % b_sh_wr_iters);
+    // Determine "position" inside the thread-block (based on warp and
+    // thread-id)
+    int warp_id = threadIdx.x / 32;
+    int n_warps =
+        thread_n_blocks / 4; // Each warp processes 4 16-size tiles over N
+    int warp_row = warp_id / n_warps;
+    int warp_col = warp_id % n_warps;
+    cur_k += warp_row * 16;
+    int th_id = threadIdx.x % 32;
+    cur_k += (th_id % 4) * 2; // Due to tensor-core layout for fp16 B matrix
+    int s_col_shift =
+        /*slice_n_offset +*/ (act_s_col_warp_stride * warp_col) +
+        (th_id / 4) * act_s_col_stride;
+    if (is_same_group[pipe]) {
+      if (k % 2 == 0) {
+        *(reinterpret_cast<int4 *>(&(act_frag_s[k % 2][0][0]))) =
+            sh_s[(same_group_id[pipe] - sh_first_group_id) * s_sh_stride +
+                 s_col_shift];
+      } else {
+        *(reinterpret_cast<int4 *>(&(act_frag_s[k % 2][0][0]))) =
+            *(reinterpret_cast<int4 *>(&(act_frag_s[(k - 1) % 2][0][0])));
+      }
+      for (int i = 1; i < 4; i++) {
+        *(reinterpret_cast<int4 *>(&(act_frag_s[k % 2][i][0]))) =
+            *(reinterpret_cast<int4 *>(&(act_frag_s[k % 2][0][0])));
+      }
+      return;
+    }
+    int4 *sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe;
+    int *sh_g_idx_int_ptr = reinterpret_cast<int *>(sh_g_idx_stage);
+    constexpr int k_frag_offsets[4] = {0, 1, 8,
+                                       9}; // Tensor core offsets per thread
+#pragma unroll
+    for (int i = 0; i < 4; i++) {
+      int actual_k = cur_k + k_frag_offsets[i];
+      int group_id = sh_g_idx_int_ptr[actual_k];
+      int rel_group_id = group_id - sh_first_group_id;
+      *(reinterpret_cast<int4 *>(&(act_frag_s[k % 2][i][0]))) =
+          sh_s[rel_group_id * s_sh_stride + s_col_shift];
+    }
+  };
+  // Execute the actual tensor core matmul of a sub-tile.
+  auto matmul = [&](int k) {
+// We have the m dimension as the inner loop in order to encourage overlapping
+// dequantization and matmul operations.
+#pragma unroll
+    for (int j = 0; j < 4; j++) {
+      int b_quant = frag_b_quant[k % 2][j];
+      int b_quant_shift = b_quant >> 8;
+      FragB frag_b0 = dequant(b_quant);
+      // Apply scale to frag_b0
+      if constexpr (has_act_order) {
+        scale4(frag_b0, act_frag_s[k % 2][0][j], act_frag_s[k % 2][1][j],
+               act_frag_s[k % 2][2][j], act_frag_s[k % 2][3][j], 0);
+      } else {
+        if constexpr (group_blocks != -1) {
+          scale(frag_b0, frag_s[k % 2][j], 0);
+        }
+      }
+      FragB frag_b1 = dequant(b_quant_shift);
+      // Apply scale to frag_b1
+      if constexpr (has_act_order) {
+        scale4(frag_b1, act_frag_s[k % 2][0][j], act_frag_s[k % 2][1][j],
+               act_frag_s[k % 2][2][j], act_frag_s[k % 2][3][j], 1);
+      } else {
+        if constexpr (group_blocks != -1) {
+          scale(frag_b1, frag_s[k % 2][j], 1);
+        }
+      }
+#pragma unroll
+      for (int i = 0; i < thread_m_blocks; i++) {
+        mma(frag_a[k % 2][i], frag_b0, frag_c[i][j][0]);
+        mma(frag_a[k % 2][i], frag_b1, frag_c[i][j][1]);
+      }
+    }
+  };
+  // Since we slice across the k dimension of a tile in order to increase the
+  // number of warps while keeping the n dimension of a tile reasonable, we have
+  // multiple warps that accumulate their partial sums of the same output
+  // location; which we have to reduce over in the end. We do in shared memory.
+  auto thread_block_reduce = [&]() {
+    constexpr int red_off = threads / b_sh_stride / 2;
+    if (red_off >= 1) {
+      int red_idx = threadIdx.x / b_sh_stride;
+      constexpr int red_sh_stride = b_sh_stride * 4 * 2;
+      constexpr int red_sh_delta = b_sh_stride;
+      int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride) +
+                      (threadIdx.x % b_sh_stride);
+      // Parallel logarithmic shared memory reduction. We make sure to avoid any
+      // unnecessary read or write iterations, e.g., for two warps we write only
+      // once by warp 1 and read only once by warp 0.
+#pragma unroll
+      for (int m_block = 0; m_block < thread_m_blocks; m_block++) {
+#pragma unroll
+        for (int i = red_off; i > 0; i /= 2) {
+          if (i <= red_idx && red_idx < 2 * i) {
+#pragma unroll
+            for (int j = 0; j < 4 * 2; j++) {
+              int red_sh_wr =
+                  red_sh_delta * j + (red_sh_rd - red_sh_stride * i);
+              if (i < red_off) {
+                float *c_rd = reinterpret_cast<float *>(
+                    &sh[red_sh_delta * j + red_sh_rd]);
+                float *c_wr = reinterpret_cast<float *>(&sh[red_sh_wr]);
+#pragma unroll
+                for (int k = 0; k < 4; k++)
+                  reinterpret_cast<FragC *>(frag_c)[4 * 2 * m_block + j][k] +=
+                      c_rd[k] + c_wr[k];
+              }
+              sh[red_sh_wr] =
+                  reinterpret_cast<int4 *>(&frag_c)[4 * 2 * m_block + j];
+            }
+          }
+          __syncthreads();
+        }
+        if (red_idx == 0) {
+#pragma unroll
+          for (int i = 0; i < 4 * 2; i++) {
+            float *c_rd =
+                reinterpret_cast<float *>(&sh[red_sh_delta * i + red_sh_rd]);
+#pragma unroll
+            for (int j = 0; j < 4; j++)
+              reinterpret_cast<FragC *>(frag_c)[4 * 2 * m_block + i][j] +=
+                  c_rd[j];
+          }
+        }
+        __syncthreads();
+      }
+    }
+  };
+  // Since multiple threadblocks may process parts of the same column slice, we
+  // finally have to globally reduce over the results. As the striped portioning
+  // minimizes the number of such reductions and our outputs are usually rather
+  // small, we perform this reduction serially in L2 cache.
+  auto global_reduce = [&](bool first = false, bool last = false) {
+    // We are very careful here to reduce directly in the output buffer to
+    // maximize L2 cache utilization in this step. To do this, we write out
+    // results in FP16 (but still reduce with FP32 compute).
+    constexpr int active_threads = 32 * thread_n_blocks / 4;
+    if (threadIdx.x < active_threads) {
+      int c_gl_stride = prob_n / 8;
+      int c_gl_wr_delta_o = 8 * c_gl_stride;
+      int c_gl_wr_delta_i = 4 * (active_threads / 32);
+      int c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) +
+                    4 * (threadIdx.x / 32) + threadIdx.x % 4;
+      c_gl_wr += (2 * thread_n_blocks) * slice_col;
+      constexpr int c_sh_wr_delta = active_threads;
+      int c_sh_wr = threadIdx.x;
+      int row = (threadIdx.x % 32) / 4;
+      if (!first) {
+// Interestingly, doing direct global accesses here really seems to mess up the
+// compiler and lead to slowdowns, hence we also use async-copies even though
+// these fetches are not actually asynchronous.
+#pragma unroll
+        for (int i = 0; i < thread_m_blocks * 4; i++) {
+          cp_async4_pred(&sh[c_sh_wr + c_sh_wr_delta * i],
+                         &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) +
+                            c_gl_wr_delta_i * (i % 2)],
+                         i < (thread_m_blocks - 1) * 4 ||
+                             8 * (i / 2) + row < prob_m);
+        }
+        cp_async_fence();
+        cp_async_wait<0>();
+      }
+#pragma unroll
+      for (int i = 0; i < thread_m_blocks * 4; i++) {
+        if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) {
+          if (!first) {
+            int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta];
+#pragma unroll
+            for (int j = 0; j < 2 * 4; j++) {
+              reinterpret_cast<float *>(
+                  &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)] +=
+                  __half2float(reinterpret_cast<__half *>(&c_red)[j]);
+            }
+          }
+          if (!last) {
+            int4 c;
+#pragma unroll
+            for (int j = 0; j < 2 * 4; j++) {
+              reinterpret_cast<__half *>(&c)[j] =
+                  __float2half(reinterpret_cast<float *>(
+                      &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)]);
+            }
+            C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)] =
+                c;
+          }
+        }
+      }
+    }
+  };
+  // Write out the reduce final result in the correct layout. We only actually
+  // reshuffle matrix fragments in this step, the reduction above is performed
+  // in fragment layout.
+  auto write_result = [&]() {
+    int c_gl_stride = prob_n / 8;
+    constexpr int c_sh_stride = 2 * thread_n_blocks + 1;
+    int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks));
+    constexpr int c_sh_rd_delta =
+        c_sh_stride * (threads / (2 * thread_n_blocks));
+    int c_gl_wr = c_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) +
+                  (threadIdx.x % (2 * thread_n_blocks));
+    c_gl_wr += (2 * thread_n_blocks) * slice_col;
+    int c_sh_wr =
+        (4 * c_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4;
+    c_sh_wr += 32 * (threadIdx.x / 32);
+    int c_sh_rd = c_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) +
+                  (threadIdx.x % (2 * thread_n_blocks));
+    int c_gl_wr_end = c_gl_stride * prob_m;
+    // We first reorder in shared memory to guarantee the most efficient final
+    // global write patterns
+    auto write = [&](int idx, float c0, float c1, FragS &s) {
+      half2 res = __halves2half2(__float2half(c0), __float2half(c1));
+      // For per-column quantization we finally apply the scale here
+      if constexpr (!has_act_order && group_blocks == -1) {
+        res = __hmul2(res, s[0]);
+      }
+      ((half2 *)sh)[idx] = res;
+    };
+    if (threadIdx.x / 32 < thread_n_blocks / 4) {
+#pragma unroll
+      for (int i = 0; i < thread_m_blocks; i++) {
+#pragma unroll
+        for (int j = 0; j < 4; j++) {
+          int wr = c_sh_wr + 8 * j;
+          write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0],
+                frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]);
+          write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2],
+                frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]);
+          write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0],
+                frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]);
+          write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2],
+                frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]);
+        }
+        c_sh_wr += 16 * (4 * c_sh_stride);
+      }
+    }
+    __syncthreads();
+#pragma unroll
+    for (int i = 0;
+         i < div_ceil(16 * thread_m_blocks, threads / (2 * thread_n_blocks));
+         i++) {
+      if (c_gl_wr < c_gl_wr_end) {
+        C[c_gl_wr] = sh[c_sh_rd];
+        c_gl_wr += c_gl_wr_delta;
+        c_sh_rd += c_sh_rd_delta;
+      }
+    }
+  };
+  // Start global fetch and register load pipelines.
+  auto start_pipes = [&]() {
+#pragma unroll
+    for (int i = 0; i < stages - 1; i++) {
+      if (has_act_order && i == 0) {
+        int last_g_idx = slice_k_start + stages * tb_k * 2;
+        if (last_g_idx >= prob_k) {
+          last_g_idx = prob_k - 1;
+        }
+        fetch_scales_to_shared(true, g_idx[slice_k_start], g_idx[last_g_idx]);
+      }
+      fetch_to_shared(i, i, i < slice_iters);
+    }
+    zero_accums();
+    wait_for_stage();
+    init_same_group(0);
+    fetch_to_registers(0, 0);
+    fetch_scales_to_registers(0, 0);
+    a_gl_rd += a_gl_rd_delta_o * (stages - 1);
+    slice_k_start_shared_fetch += tb_k * (stages - 1);
+  };
+  if (slice_iters) {
+    start_pipes();
+  }
+  // Main loop.
+  while (slice_iters) {
+    // We unroll over both the global fetch and the register load pipeline to
+    // ensure all shared memory accesses are static. Note that both pipelines
+    // have even length meaning that the next iteration will always start at
+    // index 0.
+#pragma unroll
+    for (int pipe = 0; pipe < stages;) {
+#pragma unroll
+      for (int k = 0; k < b_sh_wr_iters; k++) {
+        fetch_to_registers(k + 1, pipe % stages);
+        fetch_scales_to_registers(k + 1, pipe);
+        if (k == b_sh_wr_iters - 2) {
+          fetch_to_shared((pipe + stages - 1) % stages, pipe,
+                          slice_iters >= stages);
+          pipe++;
+          wait_for_stage();
+          init_same_group(pipe % stages);
+        }
+        matmul(k);
+      }
+      slice_iters--;
+      if (slice_iters == 0) {
+        break;
+      }
+    }
+    a_gl_rd += a_gl_rd_delta_o * stages;
+    slice_k_start += tb_k * stages;
+    slice_k_start_shared_fetch += tb_k * stages;
+    if constexpr (has_act_order) {
+      int first_group_id = g_idx[slice_k_start];
+      int last_g_idx = slice_k_start + stages * tb_k * 2;
+      if (last_g_idx >= prob_k) {
+        last_g_idx = prob_k - 1;
+      }
+      int last_group_id = g_idx[last_g_idx];
+      if (last_group_id >= sh_first_group_id + sh_num_groups) {
+        fetch_scales_to_shared(false, first_group_id, last_group_id);
+        __syncthreads();
+      }
+    }
+    // Process results and, if necessary, proceed to the next column slice.
+    // While this pattern may not be the most readable, other ways of writing
+    // the loop seemed to noticeably worse performance after compilation.
+    if (slice_iters == 0) {
+      cp_async_wait<0>();
+      bool last = slice_idx == slice_count - 1;
+      // For per-column scales, we only fetch them here in the final step before
+      // write-out
+      if constexpr (!has_act_order && group_blocks == -1) {
+        if (last) {
+          if (s_sh_wr_pred) {
+            cp_async4_stream(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
+          }
+          cp_async_fence();
+        }
+      }
+      thread_block_reduce();
+      if constexpr (!has_act_order && group_blocks == -1) {
+        if (last) {
+          cp_async_wait<0>();
+          __syncthreads();
+          if (threadIdx.x / 32 < thread_n_blocks / 4) {
+            reinterpret_cast<int4 *>(&frag_s)[0] = sh_s[s_sh_rd + 0];
+            reinterpret_cast<int4 *>(&frag_s)[1] = sh_s[s_sh_rd + 4];
+          }
+        }
+      }
+      if (slice_count > 1) { // only globally reduce if there is more than one
+                             // block in a slice
+        barrier_acquire(&locks[slice_col], slice_idx);
+        global_reduce(slice_idx == 0, last);
+        barrier_release(&locks[slice_col], last);
+      }
+      if (last) // only the last block in a slice actually writes the result
+        write_result();
+      slice_row = 0;
+      slice_col_par++;
+      slice_col++;
+      init_slice();
+      if (slice_iters) {
+        a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) +
+                  (threadIdx.x % a_gl_rd_delta_o);
+#pragma unroll
+        for (int i = 0; i < b_sh_wr_iters; i++)
+          B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles;
+        if (slice_col == 0) {
+#pragma unroll
+          for (int i = 0; i < b_sh_wr_iters; i++)
+            B_ptr[i] -= b_gl_stride;
+        }
+        // Update slice k/n for scales loading
+        if constexpr (has_act_order) {
+          slice_k_start = tb_k * slice_row;
+          slice_k_finish = slice_k_start + tb_k * slice_iters;
+          slice_k_start_shared_fetch = slice_k_start;
+          slice_n_offset = act_s_col_tb_stride * slice_col;
+        } else {
+          s_gl_rd = s_sh_stride * slice_col + threadIdx.x;
+        }
+        // if (blockIdx.x == 0 && threadIdx.x == 0) {
+        //   printf("Move\n");
+        // }
+        start_pipes();
+      }
+    }
+  }
+}
+#define __CALL_IF(THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS,           \
+                  HAS_ACT_ORDER, GROUP_BLOCKS, NUM_THREADS)                    \
+  else if (thread_m_blocks == THREAD_M_BLOCKS &&                               \
+           thread_n_blocks == THREAD_N_BLOCKS &&                               \
+           thread_k_blocks == THREAD_K_BLOCKS &&                               \
+           has_act_order == HAS_ACT_ORDER && group_blocks == GROUP_BLOCKS &&   \
+           num_threads == NUM_THREADS) {                                       \
+    cudaFuncSetAttribute(                                                      \
+        Marlin<NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \
+               pipe_stages, HAS_ACT_ORDER, GROUP_BLOCKS>,                      \
+        cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem);          \
+    Marlin<NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS,     \
+           pipe_stages, HAS_ACT_ORDER, GROUP_BLOCKS>                           \
+        <<<blocks, NUM_THREADS, max_shared_mem, stream>>>(                     \
+            A_ptr, B_ptr, C_ptr, s_ptr, g_idx_ptr, num_groups, prob_m, prob_n, \
+            prob_k, locks);                                                    \
+  }
+typedef struct {
+  int thread_k;
+  int thread_n;
+  int num_threads;
+} thread_config_t;
+thread_config_t small_batch_thread_configs[] = {
+    // Ordered by priority
+    // thread_k, thread_n, num_threads
+    {128, 128, 256}, // Default
+    {128, 64, 128},  // Reduce N 2X, same K
+    {64, 256, 256},  // Reduce K 2X, increase N 2X
+    {64, 128, 128},  // Reduce K 2X, same N
+};
+thread_config_t large_batch_thread_configs[] = {
+    // Ordered by priority
+    // thread_k, thread_n, num_threads
+    {64, 256, 256}, // Default
+    {128, 64, 128}, // Reduce N 2X, same K
+    {64, 128, 128}, // Reduce N 2X, same K
+                    // {128, 64, 128},  // Reduce N 4X, increase K 2X
+};
+bool is_valid_config(thread_config_t const &th_config, int prob_m, int prob_n,
+                     int prob_k) {
+  // Sanity
+  if (th_config.thread_k == -1 || th_config.thread_n == -1 ||
+      th_config.num_threads == -1) {
+    return false;
+  }
+  // Verify K/N are divisible by thread K/N
+  if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) {
+    return false;
+  }
+  // Verify min for thread K/N
+  if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) {
+    return false;
+  }
+  // num_threads must be at least 128 (= 4 warps)
+  if (th_config.num_threads < 128) {
+    return false;
+  }
+  return true;
+}
+thread_config_t determine_thread_config(int prob_m, int prob_n, int prob_k) {
+  // TODO: Enable if needed after some more testing
+  if (prob_m <= 0) {
+    for (auto th_config : small_batch_thread_configs) {
+      if (is_valid_config(th_config, prob_m, prob_n, prob_k)) {
+        return th_config;
+      }
+    }
+  } else {
+    for (auto th_config : large_batch_thread_configs) {
+      if (is_valid_config(th_config, prob_m, prob_n, prob_k)) {
+        return th_config;
+      }
+    }
+  }
+  return thread_config_t{-1, -1, -1};
+}
+#define CALL_IF(N_BLOCKS, K_BLOCKS, NUM_THREADS)                               \
+  __CALL_IF(1, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)                       \
+  __CALL_IF(2, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)                       \
+  __CALL_IF(3, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)                       \
+  __CALL_IF(4, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)                       \
+                                                                               \
+  __CALL_IF(1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS)                     \
+  __CALL_IF(1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)                      \
+  __CALL_IF(1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)                      \
+  __CALL_IF(1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)                      \
+                                                                               \
+  __CALL_IF(2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS)                     \
+  __CALL_IF(2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)                      \
+  __CALL_IF(2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)                      \
+  __CALL_IF(2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)                      \
+                                                                               \
+  __CALL_IF(3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS)                     \
+  __CALL_IF(3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)                      \
+  __CALL_IF(3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)                      \
+  __CALL_IF(3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)                      \
+                                                                               \
+  __CALL_IF(4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS)                     \
+  __CALL_IF(4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)                      \
+  __CALL_IF(4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)                      \
+  __CALL_IF(4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)
+void marlin_cuda(const void *A, const void *B, void *C, void *s, void *g_idx,
+                 void *perm, void *a_tmp, int prob_m, int prob_n, int prob_k,
+                 void *workspace, bool has_act_order, bool is_k_full,
+                 int num_groups, int group_size, int dev = 0,
+                 cudaStream_t stream = 0, int thread_k = -1, int thread_n = -1,
+                 int sms = -1, int max_par = 16) {
+  TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m,
+              ", ", prob_n, ", ", prob_k, "]");
+  int tot_m = prob_m;
+  int tot_m_blocks = div_ceil(tot_m, 16);
+  int pad = 16 * tot_m_blocks - tot_m;
+  if (sms == -1) {
+    cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev);
+  }
+  int max_shared_mem = 0;
+  cudaDeviceGetAttribute(&max_shared_mem,
+                         cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
+  TORCH_CHECK(max_shared_mem > 0);
+  // Set thread config
+  thread_config_t th_config;
+  if (thread_k != -1 && thread_n != -1) {
+    // User-defined config
+    th_config = thread_config_t{thread_k, thread_n, default_threads};
+  } else {
+    // Auto config
+    th_config = determine_thread_config(prob_m, prob_n, prob_k);
+  }
+  TORCH_CHECK(is_valid_config(th_config, prob_m, prob_n, prob_k),
+              "Invalid thread config: thread_k = " + str(th_config.thread_k) +
+                  ", thread_n = " + str(th_config.thread_n) +
+                  ", num_threads = " + str(th_config.num_threads) +
+                  " for MKN = [" + str(prob_m) + ", " + str(prob_k) + ", " +
+                  str(prob_n) + "]");
+  int num_threads = th_config.num_threads;
+  thread_k = th_config.thread_k;
+  thread_n = th_config.thread_n;
+  int thread_k_blocks = thread_k / 16;
+  int thread_n_blocks = thread_n / 16;
+  int blocks = sms;
+  TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n,
+              " is not divisible by thread_n = ", thread_n);
+  TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k,
+              " is not divisible by thread_k = ", thread_k);
+  int group_blocks = 0;
+  if (has_act_order) {
+    if (is_k_full) {
+      TORCH_CHECK(group_size != -1);
+      group_blocks = group_size / 16;
+      TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k,
+                  " is not divisible by group_blocks = ", group_blocks);
+    } else {
+      TORCH_CHECK(group_size == 0);
+      group_blocks = 0;
+    }
+  } else {
+    if (group_size == -1) {
+      group_blocks = -1;
+    } else {
+      group_blocks = group_size / 16;
+      TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k,
+                  " is not divisible by group_blocks = ", group_blocks);
+    }
+  }
+  const int4 *A_ptr = (const int4 *)A;
+  const int4 *B_ptr = (const int4 *)B;
+  int4 *C_ptr = (int4 *)C;
+  const int4 *s_ptr = (const int4 *)s;
+  const int *g_idx_ptr = (const int *)g_idx;
+  const int *perm_ptr = (const int *)perm;
+  int4 *a_tmp_ptr = (int4 *)a_tmp;
+  int *locks = (int *)workspace;
+  if (has_act_order) {
+    // Permute A columns
+    int block_rows = div_ceil(prob_m, blocks);
+    permute_cols_kernel<<<blocks, default_threads, 0, stream>>>(
+        A_ptr, perm_ptr, a_tmp_ptr, prob_m, prob_k, block_rows);
+    A_ptr = a_tmp_ptr;
+  }
+  // If we have a full K, then we can run the non-act-order version of Marlin
+  // (since the weight rows are reordered by increasing group ids, and by having
+  // a full K, we have full original groups)
+  if (is_k_full) {
+    has_act_order = false;
+  }
+  // Main loop
+  for (int i = 0; i < tot_m_blocks; i += 4) {
+    int thread_m_blocks = tot_m_blocks - i;
+    prob_m = tot_m - 16 * i;
+    int par = 1;
+    if (thread_m_blocks > 4) {
+      // Note that parallel > 1 currently only works for inputs without any
+      // padding
+      par = (16 * thread_m_blocks - pad) / 64;
+      if (par > max_par)
+        par = max_par;
+      prob_m = 64 * par;
+      i += 4 * (par - 1);
+      thread_m_blocks = 4;
+    }
+    // Define kernel configurations
+    if (false) {
+    }
+    CALL_IF(16, 4, 256)
+    CALL_IF(8, 8, 256)
+    CALL_IF(8, 4, 128)
+    CALL_IF(4, 8, 128)
+    else {
+      TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " +
+                             str(prob_n) + ", " + str(prob_k) + "]" +
+                             ", has_act_order = " + str(has_act_order) +
+                             ", num_groups = " + str(num_groups) +
+                             ", group_size = " + str(group_size) +
+                             ", thread_m_blocks = " + str(thread_m_blocks) +
+                             ", thread_n_blocks = " + str(thread_n_blocks) +
+                             ", thread_k_blocks = " + str(thread_k_blocks));
+    }
+    A_ptr += 16 * thread_m_blocks * (prob_k / 8) * par;
+    C_ptr += 16 * thread_m_blocks * (prob_n / 8) * par;
+  }
+}
+} // namespace gptq_marlin
+torch::Tensor gptq_marlin_gemm(torch::Tensor &a, torch::Tensor &b_q_weight,
+                               torch::Tensor &b_scales, torch::Tensor &g_idx,
+                               torch::Tensor &perm, torch::Tensor &workspace,
+                               int64_t size_m, int64_t size_n, int64_t size_k,
+                               bool is_k_full) {
+  // Verify A
+  TORCH_CHECK(a.size(0) == size_m,
+              "Shape mismatch: a.size(0) = " + str(a.size(0)) +
+                  ", size_m = " + str(size_m));
+  TORCH_CHECK(a.size(1) == size_k,
+              "Shape mismatch: a.size(1) = " + str(a.size(1)) +
+                  ", size_k = " + str(size_k));
+  // Verify B
+  TORCH_CHECK(size_k % gptq_marlin::tile_size == 0,
+              "size_k = " + str(size_k) + " is not divisible by tile_size = " +
+                  str(gptq_marlin::tile_size));
+  TORCH_CHECK((size_k / gptq_marlin::tile_size) == b_q_weight.size(0),
+              "Shape mismatch: b_q_weight.size(0) = " +
+                  str(b_q_weight.size(0)) + ", size_k = " + str(size_k) +
+                  ", tile_size = " + str(gptq_marlin::tile_size));
+  TORCH_CHECK(
+      b_q_weight.size(1) % gptq_marlin::tile_size == 0,
+      "b_q_weight.size(1) = " + str(b_q_weight.size(1)) +
+          " is not divisible by tile_size = " + str(gptq_marlin::tile_size));
+  int actual_size_n = (b_q_weight.size(1) / gptq_marlin::tile_size) *
+                      gptq_marlin::pack_factor_4bit;
+  TORCH_CHECK(size_n == actual_size_n,
+              "size_n = " + str(size_n) +
+                  ", actual_size_n = " + str(actual_size_n));
+  // Verify device and strides
+  TORCH_CHECK(a.device().is_cuda(), "A is not on GPU");
+  TORCH_CHECK(a.is_contiguous(), "A is not contiguous");
+  TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
+  TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");
+  TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU");
+  TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous");
+  TORCH_CHECK(g_idx.device().is_cuda(), "g_idx is not on GPU");
+  TORCH_CHECK(g_idx.is_contiguous(), "g_idx is not contiguous");
+  TORCH_CHECK(perm.device().is_cuda(), "perm is not on GPU");
+  TORCH_CHECK(perm.is_contiguous(), "perm is not contiguous");
+  // Alloc buffers
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
+  auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
+  torch::Tensor c = torch::empty({size_m, size_n}, options);
+  torch::Tensor a_tmp = torch::empty({size_m, size_k}, options);
+  // thread_k: `k` size of a thread_tile in `weights` (can usually be left as
+  // auto -1)
+  int thread_k = -1;
+  // thread_n: `n` size of a thread_tile in `weights` (can usually be left as
+  // auto -1)
+  int thread_n = -1;
+  // sms: number of SMs to use for the kernel (can usually be left as auto -1)
+  int sms = -1;
+  // Verify g_idx and perm
+  TORCH_CHECK((g_idx.size(0) == 0 && perm.size(0) == 0) ||
+                  (g_idx.size(0) == size_k && perm.size(0) == size_k),
+              "Unexpected g_idx.size(0) = " + str(g_idx.size(0)) +
+                  " and perm.size(0) = " + str(perm.size(0)) +
+                  ", where size_k = " + str(size_k));
+  // Detect groupsize and act_order
+  int num_groups = -1;
+  int group_size = -1;
+  bool has_act_order = g_idx.size(0) != 0;
+  int b_rank = b_scales.sizes().size();
+  TORCH_CHECK(b_rank == 2, "b_scales rank = ", b_rank, " is not 2");
+  TORCH_CHECK(b_scales.size(1) == size_n, "b_scales dim 1 = ", b_scales.size(1),
+              " is not size_n = ", size_n);
+  num_groups = b_scales.size(0);
+  if (has_act_order) {
+    if (is_k_full) {
+      TORCH_CHECK(num_groups > 1, "For act_order, num_groups must be > 1");
+      TORCH_CHECK(size_k % num_groups == 0,
+                  "size_k = " + str(size_k) +
+                      ", is not divisible by num_groups = " + str(num_groups));
+      group_size = size_k / num_groups;
+    } else {
+      group_size = 0;
+    }
+  } else {
+    if (num_groups > 1) {
+      TORCH_CHECK(size_k % num_groups == 0,
+                  "size_k = " + str(size_k) +
+                      ", is not divisible by b_scales.size(0) = " +
+                      str(b_scales.size(0)));
+      group_size = size_k / num_groups;
+    } else {
+      group_size = -1;
+    }
+  }
+  // Verify workspace size
+  TORCH_CHECK(size_n % gptq_marlin::min_thread_n == 0,
+              "size_n = " + str(size_n) +
+                  ", is not divisible by min_thread_n = " +
+                  str(gptq_marlin::min_thread_n));
+  int min_workspace_size =
+      (size_n / gptq_marlin::min_thread_n) * gptq_marlin::max_par;
+  TORCH_CHECK(workspace.numel() >= min_workspace_size,
+              "workspace.numel = " + str(workspace.numel()) +
+                  " is below min_workspace_size = " + str(min_workspace_size));
+  int dev = a.get_device();
+  gptq_marlin::marlin_cuda(
+      a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), b_scales.data_ptr(),
+      g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr(), size_m, size_n,
+      size_k, workspace.data_ptr(), has_act_order, is_k_full, num_groups,
+      group_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n,
+      sms, gptq_marlin::max_par);
+  return c;
+}
+#endif
--- a/csrc/quantization/gptq_marlin/gptq_marlin.cuh
+++ b/csrc/quantization/gptq_marlin/gptq_marlin.cuh
+#pragma once
+#include <torch/extension.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <iostream>
+namespace gptq_marlin {
+// 8 warps are a good choice since every SM has 4 schedulers and having more than 1 warp per
+// schedule allows some more latency hiding. At the same time, we want relatively few warps to have
+// many registers per warp and small tiles.
+static constexpr int default_threads = 256;
+static constexpr int pipe_stages = 4; // 4 pipeline stages fit into shared memory
+static constexpr int min_thread_n = 64;
+static constexpr int min_thread_k = 64;
+static constexpr int tile_size = 16;
+static constexpr int max_par   = 16;
+static constexpr int pack_factor_4bit = 8; // We have 8 4-bit vals inside a 32 bit
+template <typename T, int n>
+struct Vec {
+  T             elems[n];
+  __device__ T& operator[](int i) { return elems[i]; }
+};
+using I4 = Vec<int, 4>;
+constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+  // No support for async
+#else
+__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr, bool pred = true) {
+  const int BYTES = 16;
+  uint32_t  smem  = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile("{\n"
+               "   .reg .pred p;\n"
+               "   setp.ne.b32 p, %0, 0;\n"
+               "   @p cp.async.cg.shared.global [%1], [%2], %3;\n"
+               "}\n" ::"r"((int)pred),
+               "r"(smem), "l"(glob_ptr), "n"(BYTES));
+}
+__device__ inline void cp_async4_stream(void* smem_ptr, const void* glob_ptr) {
+  const int BYTES = 16;
+  uint32_t  smem  = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile("{\n"
+               "   .reg .b64 p;\n"
+               "   createpolicy.fractional.L2::evict_first.b64 p, 1.0;"
+               "   cp.async.cg.shared.global.L2::cache_hint [%0], [%1], %2, p;\n"
+               "}\n" ::"r"(smem),
+               "l"(glob_ptr), "n"(BYTES));
+}
+__device__ inline void cp_async_fence() { asm volatile("cp.async.commit_group;\n" ::); }
+template <int n>
+__device__ inline void cp_async_wait() {
+  asm volatile("cp.async.wait_group %0;\n" ::"n"(n));
+}
+#endif
+} // namespace gptq_marlin
--- a/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu
+++ b/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu
+#include "gptq_marlin.cuh"
+namespace gptq_marlin {
+static constexpr int repack_stages = 8;
+static constexpr int repack_threads = 256;
+static constexpr int tile_k_size = tile_size;
+static constexpr int tile_n_size = tile_k_size * 4;
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+template <int const num_threads, bool const has_perm>
+__global__ void
+marlin_repack_kernel(uint32_t const *__restrict__ b_q_weight_ptr,
+                     uint32_t const *__restrict__ perm_ptr,
+                     uint32_t *__restrict__ out_ptr, int size_k, int size_n) {}
+} // namespace gptq_marlin
+torch::Tensor gptq_marlin_repack(torch::Tensor &b_q_weight, torch::Tensor &perm,
+                                 int64_t size_k, int64_t size_n) {
+  TORCH_CHECK_NOT_IMPLEMENTED(
+      false, "marlin_repack_from_gptq(..) requires CUDA_ARCH >= 8.0");
+  return torch::empty({1, 1});
+}
+#else
+template <int const num_threads, bool const has_perm>
+__global__ void
+marlin_repack_kernel(uint32_t const *__restrict__ b_q_weight_ptr,
+                     uint32_t const *__restrict__ perm_ptr,
+                     uint32_t *__restrict__ out_ptr, int size_k, int size_n) {
+  int k_tiles = size_k / tile_k_size;
+  int n_tiles = size_n / tile_n_size;
+  int block_k_tiles = div_ceil(k_tiles, gridDim.x);
+  int start_k_tile = blockIdx.x * block_k_tiles;
+  if (start_k_tile >= k_tiles) {
+    return;
+  }
+  int finish_k_tile = min(start_k_tile + block_k_tiles, k_tiles);
+  // Wait until the next thread tile has been loaded to shared memory.
+  auto wait_for_stage = [&]() {
+    // We only have `stages - 2` active fetches since we are double buffering
+    // and can only issue the next fetch when it is guaranteed that the previous
+    // shared memory load is fully complete (as it may otherwise be
+    // overwritten).
+    cp_async_wait<repack_stages - 2>();
+    __syncthreads();
+  };
+  extern __shared__ int4 sh[];
+  constexpr int perm_size = tile_k_size / 4;
+  int4 *sh_perm_ptr = sh;
+  int4 *sh_pipe_ptr = sh_perm_ptr;
+  if constexpr (has_perm) {
+    sh_pipe_ptr += perm_size;
+  }
+  constexpr int stage_n_threads = tile_n_size / 4;
+  constexpr int stage_k_threads =
+      has_perm ? tile_k_size : tile_k_size / pack_factor_4bit;
+  constexpr int stage_size = stage_k_threads * stage_n_threads;
+  auto load_perm_to_shared = [&](int k_tile_id) {
+    int first_k_int4 = (k_tile_id * tile_k_size) / 4;
+    int4 const *perm_int4_ptr = reinterpret_cast<int4 const *>(perm_ptr);
+    if (threadIdx.x < perm_size) {
+      sh_perm_ptr[threadIdx.x] = perm_int4_ptr[first_k_int4 + threadIdx.x];
+    }
+    __syncthreads();
+  };
+  auto fetch_to_shared = [&](int pipe, int k_tile_id, int n_tile_id) {
+    if (n_tile_id >= n_tiles) {
+      cp_async_fence();
+      return;
+    }
+    int first_n = n_tile_id * tile_n_size;
+    int4 *sh_ptr = sh_pipe_ptr + stage_size * pipe;
+    if constexpr (has_perm) {
+      if (threadIdx.x < stage_size) {
+        int k_id = threadIdx.x / stage_n_threads;
+        int n_id = threadIdx.x % stage_n_threads;
+        uint32_t const *sh_perm_int_ptr =
+            reinterpret_cast<uint32_t const *>(sh_perm_ptr);
+        int src_k = sh_perm_int_ptr[k_id];
+        int src_k_packed = src_k / pack_factor_4bit;
+        cp_async4_stream(
+            &sh_ptr[k_id * stage_n_threads + n_id],
+            reinterpret_cast<int4 const *>(&(
+                b_q_weight_ptr[src_k_packed * size_n + first_n + (n_id * 4)])));
+      }
+    } else {
+      if (threadIdx.x < stage_size) {
+        int k_id = threadIdx.x / stage_n_threads;
+        int n_id = threadIdx.x % stage_n_threads;
+        int first_k = k_tile_id * tile_k_size;
+        int first_k_packed = first_k / pack_factor_4bit;
+        cp_async4_stream(&sh_ptr[k_id * stage_n_threads + n_id],
+                         reinterpret_cast<int4 const *>(
+                             &(b_q_weight_ptr[(first_k_packed + k_id) * size_n +
+                                              first_n + (n_id * 4)])));
+      }
+    }
+    cp_async_fence();
+  };
+  auto repack_tile = [&](int pipe, int k_tile_id, int n_tile_id) {
+    if (n_tile_id >= n_tiles) {
+      return;
+    }
+    int warp_id = threadIdx.x / 32;
+    int th_id = threadIdx.x % 32;
+    if (warp_id >= 4) {
+      return;
+    }
+    int tc_col = th_id / 4;
+    int tc_row = (th_id % 4) * 2;
+    constexpr int tc_offsets[4] = {0, 1, 8, 9};
+    int cur_n = warp_id * 16 + tc_col;
+    constexpr int sh_stride = 64;
+    int4 *sh_stage_ptr = sh_pipe_ptr + stage_size * pipe;
+    uint32_t *sh_stage_int_ptr = reinterpret_cast<uint32_t *>(sh_stage_ptr);
+    uint32_t *sh_perm_int_ptr = reinterpret_cast<uint32_t *>(sh_perm_ptr);
+    uint32_t vals[pack_factor_4bit];
+    if constexpr (has_perm) {
+      for (int i = 0; i < 4; i++) {
+        int k_idx = tc_row + tc_offsets[i];
+        uint32_t src_k = sh_perm_int_ptr[k_idx];
+        uint32_t src_k_pos = src_k % pack_factor_4bit;
+        uint32_t b1_val = sh_stage_int_ptr[k_idx * sh_stride + cur_n];
+        uint32_t b1_cur_val = (b1_val >> (src_k_pos * 4)) & 0xf;
+        uint32_t b2_val = sh_stage_int_ptr[k_idx * sh_stride + cur_n + 8];
+        uint32_t b2_cur_val = (b2_val >> (src_k_pos * 4)) & 0xf;
+        vals[i] = b1_cur_val;
+        vals[4 + i] = b2_cur_val;
+      }
+    } else {
+      uint32_t b1_val_1 = sh_stage_int_ptr[cur_n];
+      uint32_t b1_val_2 = sh_stage_int_ptr[sh_stride + cur_n];
+      uint32_t b2_val_1 = sh_stage_int_ptr[cur_n + 8];
+      uint32_t b2_val_2 = sh_stage_int_ptr[sh_stride + cur_n + 8];
+#pragma unroll
+      for (int i = 0; i < 2; i++) {
+        int cur_elem = tc_row + tc_offsets[i];
+        vals[i] = (b1_val_1 >> (cur_elem * 4)) & 0xf;
+        vals[4 + i] = (b2_val_1 >> (cur_elem * 4)) & 0xf;
+      }
+#pragma unroll
+      for (int i = 2; i < 4; i++) {
+        int cur_elem = tc_row + tc_offsets[i] - 8;
+        vals[i] = (b1_val_2 >> (cur_elem * 4)) & 0xf;
+        vals[4 + i] = (b2_val_2 >> (cur_elem * 4)) & 0xf;
+      }
+    }
+    // Result of:
+    // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
+    constexpr int pack_idx[pack_factor_4bit] = {0, 2, 4, 6, 1, 3, 5, 7};
+    uint32_t res = 0;
+#pragma unroll
+    for (int i = 0; i < pack_factor_4bit; i++) {
+      res |= vals[pack_idx[i]] << (i * 4);
+    }
+    constexpr int tile_size = tile_k_size * tile_n_size / pack_factor_4bit;
+    int out_offset = (k_tile_id * n_tiles + n_tile_id) * tile_size;
+    out_ptr[out_offset + th_id * 4 + warp_id] = res;
+  };
+  auto start_pipes = [&](int k_tile_id, int n_tile_id) {
+#pragma unroll
+    for (int pipe = 0; pipe < repack_stages - 1; pipe++) {
+      fetch_to_shared(pipe, k_tile_id, n_tile_id + pipe);
+    }
+    wait_for_stage();
+  };
+#pragma unroll
+  for (int k_tile_id = start_k_tile; k_tile_id < finish_k_tile; k_tile_id++) {
+    int n_tile_id = 0;
+    if constexpr (has_perm) {
+      load_perm_to_shared(k_tile_id);
+    }
+    start_pipes(k_tile_id, n_tile_id);
+    while (n_tile_id < n_tiles) {
+#pragma unroll
+      for (int pipe = 0; pipe < repack_stages; pipe++) {
+        fetch_to_shared((pipe + repack_stages - 1) % repack_stages, k_tile_id,
+                        n_tile_id + pipe + repack_stages - 1);
+        repack_tile(pipe, k_tile_id, n_tile_id + pipe);
+        wait_for_stage();
+      }
+      n_tile_id += repack_stages;
+    }
+  }
+}
+} // namespace gptq_marlin
+torch::Tensor gptq_marlin_repack(torch::Tensor &b_q_weight, torch::Tensor &perm,
+                                 int64_t size_k, int64_t size_n) {
+  // Verify compatibility with marlin tile of 16x64
+  TORCH_CHECK(size_k % gptq_marlin::tile_k_size == 0, "size_k = ", size_k,
+              " is not divisible by tile_k_size = ", gptq_marlin::tile_k_size);
+  TORCH_CHECK(size_n % gptq_marlin::tile_n_size == 0, "size_n = ", size_n,
+              " is not divisible by tile_n_size = ", gptq_marlin::tile_n_size);
+  // Verify B
+  TORCH_CHECK((size_k / gptq_marlin::pack_factor_4bit) == b_q_weight.size(0),
+              "Shape mismatch: b_q_weight.size(0) = ", b_q_weight.size(0),
+              ", size_k = ", size_k,
+              ", pack_factor_4bit = ", gptq_marlin::pack_factor_4bit);
+  TORCH_CHECK(b_q_weight.size(1) == size_n,
+              "b_q_weight.size(1) = ", b_q_weight.size(1),
+              " is not size_n = ", size_n);
+  // Verify device and strides
+  TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
+  TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");
+  TORCH_CHECK(b_q_weight.dtype() == at::kInt, "b_q_weight type is not kInt");
+  TORCH_CHECK(perm.device().is_cuda(), "perm is not on GPU");
+  TORCH_CHECK(perm.is_contiguous(), "perm is not contiguous");
+  TORCH_CHECK(perm.dtype() == at::kInt, "perm type is not at::kInt");
+  // Alloc buffers
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(b_q_weight));
+  auto options = torch::TensorOptions()
+                     .dtype(b_q_weight.dtype())
+                     .device(b_q_weight.device());
+  torch::Tensor out = torch::empty(
+      {size_k / gptq_marlin::tile_size,
+       size_n * gptq_marlin::tile_size / gptq_marlin::pack_factor_4bit},
+      options);
+  // Detect if there is act_order
+  bool has_perm = perm.size(0) != 0;
+  // Get ptrs
+  uint32_t const *b_q_weight_ptr =
+      reinterpret_cast<uint32_t const *>(b_q_weight.data_ptr());
+  uint32_t const *perm_ptr =
+      reinterpret_cast<uint32_t const *>(perm.data_ptr());
+  uint32_t *out_ptr = reinterpret_cast<uint32_t *>(out.data_ptr());
+  // Get dev info
+  int dev = b_q_weight.get_device();
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev);
+  int blocks;
+  cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
+  int max_shared_mem = 0;
+  cudaDeviceGetAttribute(&max_shared_mem,
+                         cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
+  TORCH_CHECK(max_shared_mem > 0);
+  if (has_perm) {
+    cudaFuncSetAttribute(
+        gptq_marlin::marlin_repack_kernel<gptq_marlin::repack_threads, true>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize,
+        max_shared_mem);
+    gptq_marlin::marlin_repack_kernel<gptq_marlin::repack_threads, true>
+        <<<blocks, gptq_marlin::repack_threads, max_shared_mem,
+           stream>>>(b_q_weight_ptr, perm_ptr, out_ptr, size_k, size_n);
+  } else {
+    cudaFuncSetAttribute(
+        gptq_marlin::marlin_repack_kernel<gptq_marlin::repack_threads, false>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize,
+        max_shared_mem);
+    gptq_marlin::marlin_repack_kernel<gptq_marlin::repack_threads, false>
+        <<<blocks, gptq_marlin::repack_threads, max_shared_mem,
+           stream>>>(b_q_weight_ptr, perm_ptr, out_ptr, size_k, size_n);
+  }
+  return out;
+}
+#endif
--- a/tests/models/test_gptq_marlin.py
+++ b/tests/models/test_gptq_marlin.py
+"""Compares the outputs of gptq vs gptq_marlin 
+Note: GPTQ and Marlin do not have bitwise correctness.
+As a result, in this test, we just confirm that the top selected tokens of the
+Marlin/GPTQ models are in the top 3 selections of each other.
+Note: Marlin internally uses locks to synchronize the threads. This can
+result in very slight nondeterminism for Marlin. As a result, we re-run the test
+up to 3 times to see if we pass.
+Note: This test currently fails running with --forked with the following:
+    RuntimeError: Cannot re-initialize CUDA in forked subprocess.
+    To use CUDA with multiprocessing, you must use the 'spawn' start method
+Run `pytest tests/models/test_gptq_marlin.py`.
+"""
+import os
+import pytest
+import torch
+from tests.models.utils import check_logprobs_close
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+os.environ["TOKENIZERS_PARALLELISM"] = "true"
+MAX_MODEL_LEN = 1024
+capability = torch.cuda.get_device_capability()
+capability = capability[0] * 10 + capability[1]
+gptq_marlin_not_supported = (
+    capability < QUANTIZATION_METHODS["gptq_marlin"].get_min_capability())
+MODELS = [
+    # act_order==False, group_size=channelwise
+    ("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"),
+    # act_order==False, group_size=128
+    ("TheBloke/Llama-2-7B-GPTQ", "main"),
+    # act_order==True, group_size=128
+    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"),
+    # act_order==True, group_size=64
+    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-64g-actorder_True"),
+    # act_order==True, group_size=32
+    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-32g-actorder_True"),
+]
+@pytest.mark.flaky(reruns=2)
+@pytest.mark.skipif(gptq_marlin_not_supported,
+                    reason="gptq_marlin is not supported on this GPU type.")
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models(
+    vllm_runner,
+    example_prompts,
+    model,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+    model_name, revision = model
+    # Run marlin.
+    gptq_marlin_model = vllm_runner(model_name=model_name,
+                                    revision=revision,
+                                    dtype=dtype,
+                                    quantization="marlin",
+                                    max_model_len=MAX_MODEL_LEN,
+                                    tensor_parallel_size=1,
+                                    disable_custom_all_reduce=True)
+    gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs(
+        example_prompts, max_tokens, num_logprobs)
+    del gptq_marlin_model
+    # Run gptq.
+    gptq_model = vllm_runner(model_name=model_name,
+                             revision=revision,
+                             dtype=dtype,
+                             quantization="gptq",
+                             max_model_len=MAX_MODEL_LEN,
+                             tensor_parallel_size=1,
+                             disable_custom_all_reduce=True)
+    gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts,
+                                                       max_tokens,
+                                                       num_logprobs)
+    del gptq_model
+    check_logprobs_close(
+        outputs_0_lst=gptq_outputs,
+        outputs_1_lst=gptq_marlin_outputs,
+        name_0="gptq",
+        name_1="gptq_marlin",
+    )
--- a/tests/models/test_marlin.py
+++ b/tests/models/test_marlin.py
@@ -10,12 +10,12 @@ up to 3 times to see if we pass.
 Run `pytest tests/models/test_marlin.py`.
 """
 from dataclasses import dataclass
 import pytest
 import torch
+from tests.models.utils import check_logprobs_close
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 capability = torch.cuda.get_device_capability()
@@ -55,43 +55,24 @@ def test_models(
    max_tokens: int,
    num_logprobs: int,
 ) -> None:
-    marlin_model = vllm_runner(model_pair.model_marlin, dtype=dtype)
+    marlin_model = vllm_runner(model_pair.model_marlin,
+                               dtype=dtype,
+                               quantization="marlin")
    marlin_outputs = marlin_model.generate_greedy_logprobs(
        example_prompts, max_tokens, num_logprobs)
-    # Note: not sure why, but deleting just the model on Ada Lovelace
-    #   does not free the GPU memory. On Ampere, deleting the just model
-    #   frees the memory.
    del marlin_model
-    gptq_model = vllm_runner(model_pair.model_gptq, dtype=dtype)
+    gptq_model = vllm_runner(model_pair.model_gptq,
+                             dtype=dtype,
+                             quantization="gptq")
    gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts,
                                                       max_tokens,
                                                       num_logprobs)
-    # Note: not sure why, but deleting just the model on Ada Lovelace
-    #   does not free the GPU memory. On Ampere, deleting the just model
-    #   frees the memory.
    del gptq_model
-    # loop through the prompts
+    check_logprobs_close(
-    for prompt_idx in range(len(example_prompts)):
+        outputs_0_lst=gptq_outputs,
-        gptq_output_ids, gptq_output_str, gptq_logprobs = gptq_outputs[
+        outputs_1_lst=marlin_outputs,
-            prompt_idx]
+        name_0="gptq",
-        marlin_output_ids, marlin_output_str, marlin_logprobs = marlin_outputs[
+        name_1="marlin",
-            prompt_idx]
+    )
-        for idx, (gptq_output_id, marlin_output_id) in enumerate(
-                zip(gptq_output_ids, marlin_output_ids)):
-            # If sequence is not an exact match,
-            if marlin_output_id != gptq_output_id:
-                # Each predicted token must be in top 5 of the other's
-                assert gptq_output_id in marlin_logprobs[idx], (
-                    f"Test{prompt_idx}:\nGPTQ:\t{gptq_output_str!r}\n"
-                    f"Marlin:\t{marlin_output_str!r}")
-                assert marlin_output_id in gptq_logprobs[idx], (
-                    f"Test{prompt_idx}:\nGPTQ:\t{gptq_output_str!r}\n"
-                    f"Marlin:\t{marlin_output_str!r}")
-                # Break out since sequences will now diverge.
-                break
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
+def check_logprobs_close(outputs_0_lst, outputs_1_lst, name_0, name_1):
+    """Compare the logprobs of two sequences generated by different models, 
+    which should be similar but not necessarily equal.
+    """
+    # Loop through responses to each prompt.
+    for prompt_idx, (outputs_0,
+                     outputs_1) in enumerate(zip(outputs_0_lst,
+                                                 outputs_1_lst)):
+        output_ids_0, output_str_0, logprobs_0 = outputs_0
+        output_ids_1, output_str_1, logprobs_1 = outputs_1
+        # Loop through generated tokens.
+        for idx, (output_id_0,
+                  output_id_1) in enumerate(zip(output_ids_0, output_ids_1)):
+            # If generated tokens don't match, then
+            if output_id_0 != output_id_1:
+                # Each predicted token must be in top N logprobs of the other
+                assert output_id_0 in logprobs_1[idx], (
+                    f"Test{prompt_idx}:"
+                    f"\n{name_0}:\t{output_str_0!r}"
+                    f"\n{name_1}:\t{output_str_1!r}")
+                assert output_id_1 in logprobs_0[idx], (
+                    f"Test{prompt_idx}:"
+                    f"\n{name_0}:\t{output_str_0!r}"
+                    f"\n{name_1}:\t{output_str_1!r}")
+                # Break out since sequences will now diverge.
+                break
--- a/tests/quantization/test_autogptq_marlin_configs.py
+++ b/tests/quantization/test_autogptq_marlin_configs.py
-"""Tests whether Marlin models can be loaded from the autogptq config.
-Run `pytest tests/quantization/test_autogptq_marlin_configs.py --forked`.
-"""
-from dataclasses import dataclass
-import pytest
-from vllm.config import ModelConfig
-@dataclass
-class ModelPair:
-    model_marlin: str
-    model_gptq: str
-# Model Id // Expected Kernel
-MODELS_QUANT_TYPE = [
-    # compat: autogptq <=0.7.1 is_marlin_format: bool
-    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "marlin"),
-    ("TheBloke/Llama-2-7B-Chat-GPTQ", "gptq"),
-    # compat: autogptq >=0.8.0 use checkpoint_format: str
-    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "marlin"),
-    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "gptq")
-]
-@pytest.mark.parametrize("model_quant_type", MODELS_QUANT_TYPE)
-def test_auto_gptq(model_quant_type: str, ) -> None:
-    model_path, quant_type = model_quant_type
-    model_config_no_quant_arg = ModelConfig(
-        model_path,
-        model_path,
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        seed=0,
-        dtype="float16",
-        revision=None,
-        quantization=None  # case 1
-    )
-    model_config_quant_arg = ModelConfig(
-        model_path,
-        model_path,
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        seed=0,
-        dtype="float16",
-        revision=None,
-        quantization="gptq"  # case 2
-    )
-    assert model_config_no_quant_arg.quantization == quant_type, (
-        f"Expected quant_type == {quant_type} for {model_path}, "
-        f"but found {model_config_no_quant_arg.quantization} "
-        "for no --quantization None case")
-    assert model_config_quant_arg.quantization == quant_type, (
-        f"Expected quant_type == {quant_type} for {model_path}, "
-        f"but found {model_config_quant_arg.quantization} "
-        "for --quantization gptq case")
--- a/tests/quantization/test_configs.py
+++ b/tests/quantization/test_configs.py
+"""Tests whether Marlin models can be loaded from the autogptq config.
+Run `pytest tests/quantization/test_configs.py --forked`.
+"""
+from dataclasses import dataclass
+import pytest
+from vllm.config import ModelConfig
+@dataclass
+class ModelPair:
+    model_marlin: str
+    model_gptq: str
+# Model Id // Quantization Arg // Expected Type
+MODEL_ARG_EXPTYPES = [
+    # AUTOGPTQ
+    # compat: autogptq <=0.7.1 is_marlin_format: bool
+    # Model Serialized in Marlin Format should always use Marlin kernel.
+    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", None, "marlin"),
+    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "marlin", "marlin"),
+    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "gptq", "marlin"),
+    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "awq", "ERROR"),
+    # Model Serialized in Exllama Format.
+    ("TheBloke/Llama-2-7B-Chat-GPTQ", None, "gptq_marlin"),
+    ("TheBloke/Llama-2-7B-Chat-GPTQ", "marlin", "gptq_marlin"),
+    ("TheBloke/Llama-2-7B-Chat-GPTQ", "gptq", "gptq"),
+    ("TheBloke/Llama-2-7B-Chat-GPTQ", "awq", "ERROR"),
+    # compat: autogptq >=0.8.0 use checkpoint_format: str
+    # Model Serialized in Marlin Format should always use Marlin kernel.
+    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", None, "marlin"),
+    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "marlin", "marlin"),
+    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "gptq", "marlin"),
+    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "awq", "ERROR"),
+    # Model Serialized in Exllama Format.
+    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", None, "gptq_marlin"),
+    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "marlin", "gptq_marlin"),
+    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "gptq", "gptq"),
+    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "awq", "ERROR"),
+    # AUTOAWQ
+    ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", None, "awq"),
+    ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "awq", "awq"),
+    ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "marlin", "ERROR"),
+    ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "gptq", "ERROR"),
+]
+@pytest.mark.parametrize("model_arg_exptype", MODEL_ARG_EXPTYPES)
+def test_auto_gptq(model_arg_exptype: str) -> None:
+    model_path, quantization_arg, expected_type = model_arg_exptype
+    try:
+        model_config = ModelConfig(model_path,
+                                   model_path,
+                                   tokenizer_mode="auto",
+                                   trust_remote_code=False,
+                                   seed=0,
+                                   dtype="float16",
+                                   revision=None,
+                                   quantization=quantization_arg)
+        found_quantization_type = model_config.quantization
+    except ValueError:
+        found_quantization_type = "ERROR"
+    assert found_quantization_type == expected_type, (
+        f"Expected quant_type == {expected_type} for {model_path}, "
+        f"but found {found_quantization_type} "
+        f"for no --quantization {quantization_arg} case")
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -9,11 +9,14 @@ from packaging.version import Version
 from transformers import PretrainedConfig
 from vllm.logger import init_logger
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS,
+                                                     get_quantization_config)
 from vllm.transformers_utils.config import get_config, get_hf_text_config
 from vllm.utils import (get_cpu_memory, get_nvcc_cuda_version, is_cpu, is_hip,
                        is_neuron)
+GPTQMarlinConfig = get_quantization_config("gptq_marlin")
 if TYPE_CHECKING:
    from ray.util.placement_group import PlacementGroup
@@ -138,14 +141,34 @@ class ModelConfig:
            is_format_marlin = (quant_cfg.get("checkpoint_format") == "marlin"
                                or quant_cfg.get("is_marlin_format", False))
-            # Use marlin if the GPTQ model is serialized in marlin format.
+            # Check which LinearMethod the GPTQ model should use.
-            if quant_method == "gptq" and is_format_marlin:
+            if quant_method == "gptq":
+                # If serialized in Marlin format, use MarlinLinearMethod.
+                # TODO (@robertgshaw): migrate under GPTQMarlinLinearMethod.
+                if is_format_marlin:
                    logger.info("The model is serialized in Marlin format. "
                                "Using Marlin kernel.")
                    quant_method = "marlin"
                    if self.quantization == "gptq":
                        self.quantization = quant_method
+                # If convertible to Marlin format, use GPTQMarlinLinearMethod
+                # unless the user explicitly specified GPTQLinearMethod.
+                elif GPTQMarlinConfig.is_marlin_compatible(quant_cfg):
+                    if self.quantization == "gptq":
+                        logger.warning(
+                            "The model is convertible to Marlin format, but "
+                            "you specified quantization=gptq. Use "
+                            "quantization=marlin for faster inference.")
+                    else:
+                        logger.info(
+                            "The model is convertible to Marlin format. "
+                            "Using Marlin kernel.")
+                        quant_method = "gptq_marlin"
+                        if self.quantization == "marlin":
+                            self.quantization = quant_method
+            # Verify quantization configurations.
            if self.quantization is None:
                self.quantization = quant_method
            elif self.quantization != quant_method:
@@ -165,7 +188,7 @@ class ModelConfig:
                raise ValueError(
                    f"{self.quantization} quantization is currently not "
                    f"supported in ROCm.")
-            if self.quantization != "marlin":
+            if (self.quantization not in ["marlin", "gptq_marlin"]):
                logger.warning(
                    "%s quantization is not fully "
                    "optimized yet. The speed can be slower than "

--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -6,6 +6,8 @@ from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig)
 from vllm.model_executor.layers.quantization.fp8 import Fp8Config
 from vllm.model_executor.layers.quantization.gptq import GPTQConfig
+from vllm.model_executor.layers.quantization.gptq_marlin import (
+    GPTQMarlinConfig)
 from vllm.model_executor.layers.quantization.marlin import MarlinConfig
 from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig
@@ -15,6 +17,7 @@ QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
    "fp8": Fp8Config,
    "gptq": GPTQConfig,
    "squeezellm": SqueezeLLMConfig,
+    "gptq_marlin": GPTQMarlinConfig,
    "marlin": MarlinConfig,
 }

--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
+import enum
+from enum import Enum
+from typing import Any, Dict, List, Optional
+import numpy
+import torch
+from torch.nn.parameter import Parameter
+from vllm._C import ops
+from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
+                                               set_weight_attrs)
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig)
+GPTQ_MARLIN_TILE = 16
+GPTQ_MARLIN_MIN_THREAD_N = 64
+GPTQ_MARLIN_MIN_THREAD_K = 128
+GPTQ_MARLIN_MAX_PARALLEL = 16
+GPTQ_MARLIN_SUPPORTED_NUM_BITS = [4]
+GPTQ_MARLIN_SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]
+GPTQ_MARLIN_SUPPORTED_SYM = [True]
+# Precompute permutations for Marlin weight and scale shuffling
+#
+# Marlin works on [16,64] tiles. The goal of the permutations
+# is to reorder the weight data so that it is compatible
+# with the tensor-core format that is described here:
+# https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type # noqa: E501
+#
+# As a result of this reordering, the vector loads inside the
+# kernel will get the data as it is needed for tensor-core
+# (without the need to use ldmatrix instructions)
+def _get_perms():
+    perm = []
+    for i in range(32):
+        perm1 = []
+        col = i // 4
+        for block in [0, 1]:
+            for row in [
+                    2 * (i % 4),
+                    2 * (i % 4) + 1,
+                    2 * (i % 4 + 4),
+                    2 * (i % 4 + 4) + 1,
+            ]:
+                perm1.append(16 * row + col + 8 * block)
+        for j in range(4):
+            perm.extend([p + 256 * j for p in perm1])
+    perm = numpy.array(perm)
+    interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])
+    perm = perm.reshape((-1, 8))[:, interleave].ravel()  # type: ignore
+    perm = torch.from_numpy(perm)
+    scale_perm = []
+    for i in range(8):
+        scale_perm.extend([i + 8 * j for j in range(8)])
+    scale_perm_single = []
+    for i in range(4):
+        scale_perm_single.extend(
+            [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
+    return perm, scale_perm, scale_perm_single
+_perm, _scale_perm, _scale_perm_single = _get_perms()
+def get_pack_factor(num_bits):
+    assert num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS, (
+        f"Unsupported num_bits = {num_bits}")
+    return 32 // num_bits
+def marlin_permute_scales(s, size_k, size_n, group_size):
+    if group_size < size_k and group_size != -1:
+        s = s.reshape((-1, len(_scale_perm)))[:, _scale_perm]
+    else:
+        s = s.reshape((-1, len(_scale_perm_single)))[:, _scale_perm_single]
+    s = s.reshape((-1, size_n)).contiguous()
+    return s
+class GPTQMarlinConfig(QuantizationConfig):
+    """Config class for GPTQ Marlin"""
+    def __init__(self, weight_bits: int, group_size: int, desc_act: bool,
+                 is_sym: bool) -> None:
+        if desc_act and group_size == -1:
+            # In this case, act_order == True is the same as act_order == False
+            # (since we have only one group per output channel)
+            desc_act = False
+        self.weight_bits = weight_bits
+        self.group_size = group_size
+        self.desc_act = desc_act
+        self.is_sym = is_sym
+        # Verify
+        if self.weight_bits not in GPTQ_MARLIN_SUPPORTED_NUM_BITS:
+            raise ValueError(
+                f"Marlin does not support weight_bits = {self.weight_bits}. "
+                f"Only weight_bits = {GPTQ_MARLIN_SUPPORTED_NUM_BITS} "
+                "are supported.")
+        if self.group_size not in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES:
+            raise ValueError(
+                f"Marlin does not support group_size = {self.group_size}. "
+                f"Only group_sizes = {GPTQ_MARLIN_SUPPORTED_GROUP_SIZES} "
+                "are supported.")
+        if self.is_sym not in GPTQ_MARLIN_SUPPORTED_SYM:
+            raise ValueError(
+                f"Marlin does not support is_sym = {self.is_sym}. "
+                f"Only sym = {GPTQ_MARLIN_SUPPORTED_SYM} are supported.")
+        # Init
+        self.pack_factor = get_pack_factor(weight_bits)
+        self.tile_size = GPTQ_MARLIN_TILE
+        self.min_thread_n = GPTQ_MARLIN_MIN_THREAD_N
+        self.min_thread_k = GPTQ_MARLIN_MIN_THREAD_K
+        self.max_parallel = GPTQ_MARLIN_MAX_PARALLEL
+    def __repr__(self) -> str:
+        return (f"GPTQMarlinConfig(weight_bits={self.weight_bits}, "
+                f"group_size={self.group_size}, "
+                f"desc_act={self.desc_act})")
+    @classmethod
+    def get_name(cls) -> str:
+        return "gptq_marlin"
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.half]
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 80
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return ["quantize_config.json"]
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig":
+        weight_bits = cls.get_from_keys(config, ["bits"])
+        group_size = cls.get_from_keys(config, ["group_size"])
+        desc_act = cls.get_from_keys(config, ["desc_act"])
+        is_sym = cls.get_from_keys(config, ["sym"])
+        return cls(weight_bits, group_size, desc_act, is_sym)
+    def get_quant_method(
+            self,
+            layer: torch.nn.Module) -> Optional["GPTQMarlinLinearMethod"]:
+        if isinstance(layer, LinearBase):
+            return GPTQMarlinLinearMethod(self)
+        return None
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+    @classmethod
+    def is_marlin_compatible(cls, quant_config: Dict[str, Any]):
+        # Extract data from quant config.
+        num_bits = quant_config.get("bits", None)
+        group_size = quant_config.get("group_size", None)
+        sym = quant_config.get("sym", None)
+        desc_act = quant_config.get("desc_act", None)
+        # If we cannot find the info needed in the config, cannot convert.
+        if (num_bits is None or group_size is None or sym is None
+                or desc_act is None):
+            return False
+        # If the capability of the device is too low, cannot convert.
+        major, minor = torch.cuda.get_device_capability()
+        device_capability = major * 10 + minor
+        if device_capability < cls.get_min_capability():
+            return False
+        # Otherwise, can convert if model satisfies marlin constraints.
+        return (num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS
+                and group_size in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES
+                and sym in GPTQ_MARLIN_SUPPORTED_SYM)
+class GPTQMarlinState(Enum):
+    REPACK = enum.auto()
+    READY = enum.auto()
+class GPTQMarlinLinearMethod(LinearMethodBase):
+    """Linear method for GPTQ Marlin.
+    Args:
+        quant_config: The GPTQ Marlin quantization config.
+    """
+    def __init__(self, quant_config: GPTQMarlinConfig) -> None:
+        self.quant_config = quant_config
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ) -> None:
+        del output_size
+        # Normalize group_size
+        if self.quant_config.group_size != -1:
+            group_size = self.quant_config.group_size
+        else:
+            group_size = input_size
+        # Validate dtype
+        if params_dtype != torch.float16:
+            raise ValueError(
+                f"The params dtype must be float16, but got {params_dtype}")
+        # Validate output_size_per_partition
+        output_size_per_partition = sum(output_partition_sizes)
+        if output_size_per_partition % self.quant_config.min_thread_n != 0:
+            raise ValueError(
+                f"Weight output_size_per_partition = "
+                f"{output_size_per_partition} is not divisible by "
+                f" min_thread_n = {self.quant_config.min_thread_n}.")
+        # Validate input_size_per_partition
+        if input_size_per_partition % self.quant_config.min_thread_k != 0:
+            raise ValueError(
+                f"Weight input_size_per_partition = "
+                f"{input_size_per_partition} is not divisible "
+                f"by min_thread_k = {self.quant_config.min_thread_k}.")
+        if (group_size < input_size
+                and input_size_per_partition % group_size != 0):
+            raise ValueError(
+                f"Weight input_size_per_partition = {input_size_per_partition}"
+                f" is not divisible by group_size = {group_size}.")
+        # Detect sharding of scales/zp
+        # By default, no sharding over "input dim"
+        scales_and_zp_size = input_size // group_size
+        scales_and_zp_input_dim = None
+        if self.quant_config.desc_act:
+            # Act-order case
+            assert self.quant_config.group_size != -1
+            is_k_full = input_size_per_partition == input_size
+        else:
+            # No act-order case
+            # K is always full due to full alignment with
+            # group-size and shard of scales/zp
+            is_k_full = True
+            # If this is a row-parallel case, then shard scales/zp
+            if (input_size != input_size_per_partition
+                    and self.quant_config.group_size != -1):
+                scales_and_zp_size = input_size_per_partition // group_size
+                scales_and_zp_input_dim = 0
+        # Init buffers
+        # Quantized weights
+        qweight = Parameter(
+            torch.empty(
+                input_size_per_partition // self.quant_config.pack_factor,
+                output_size_per_partition,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        set_weight_attrs(
+            qweight, {
+                **extra_weight_attrs,
+                "input_dim": 0,
+                "output_dim": 1,
+                "packed_dim": 0,
+                "pack_factor": self.quant_config.pack_factor,
+            })
+        # Activation order
+        g_idx = Parameter(
+            torch.empty(
+                input_size_per_partition,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        # Ignore warning from fused linear layers such as QKVParallelLinear.
+        set_weight_attrs(g_idx, {
+            **extra_weight_attrs, "input_dim": 0,
+            "ignore_warning": True
+        })
+        g_idx_sort_indices = Parameter(
+            torch.empty(
+                g_idx.shape,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        set_weight_attrs(g_idx_sort_indices, extra_weight_attrs)
+        # Scales
+        scales = Parameter(
+            torch.empty(
+                scales_and_zp_size,
+                output_size_per_partition,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        set_weight_attrs(
+            scales, {
+                **extra_weight_attrs,
+                "input_dim": scales_and_zp_input_dim,
+                "output_dim": 1,
+            })
+        # Quantized zero-points
+        qzeros = Parameter(
+            torch.empty(scales_and_zp_size,
+                        output_size_per_partition //
+                        self.quant_config.pack_factor,
+                        dtype=torch.int32,
+                        device="meta"),
+            requires_grad=False,
+        )
+        set_weight_attrs(
+            qzeros, {
+                **extra_weight_attrs,
+                "input_dim": scales_and_zp_input_dim,
+                "output_dim": 1,
+                "packed_dim": 1,
+                "pack_factor": self.quant_config.pack_factor,
+            })
+        # Allocate marlin workspace
+        max_workspace_size = (
+            output_size_per_partition //
+            self.quant_config.min_thread_n) * self.quant_config.max_parallel
+        workspace = torch.zeros(max_workspace_size,
+                                dtype=torch.int,
+                                requires_grad=False)
+        layer.register_parameter("qweight", qweight)
+        layer.register_parameter("g_idx", g_idx)
+        layer.register_parameter("g_idx_sort_indices", g_idx_sort_indices)
+        layer.register_parameter("scales", scales)
+        layer.register_parameter("qzeros", qzeros)
+        layer.workspace = workspace
+        layer.input_size_per_partition = input_size_per_partition
+        layer.output_size_per_partition = output_size_per_partition
+        layer.input_size = input_size
+        layer.is_k_full = is_k_full
+        layer.marlin_state = GPTQMarlinState.REPACK
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        reshaped_x = x.reshape(-1, x.shape[-1])
+        size_m = reshaped_x.shape[0]
+        part_size_n = layer.output_size_per_partition
+        part_size_k = layer.input_size_per_partition
+        full_size_k = layer.input_size
+        out_shape = x.shape[:-1] + (part_size_n, )
+        if layer.marlin_state == GPTQMarlinState.REPACK:
+            layer.marlin_state = GPTQMarlinState.READY
+            # Newly generated tensors need to replace existing tensors that are
+            # already registered as parameters by vLLM (and won't be freed)
+            def replace_tensor(name, new_t):
+                # It is important to use resize_() here since it ensures
+                # the same buffer is reused
+                getattr(layer, name).resize_(new_t.shape)
+                getattr(layer, name).copy_(new_t)
+                del new_t
+            cur_device = layer.qweight.device
+            # Process act_order
+            if self.quant_config.desc_act:
+                # Get sorting based on g_idx
+                g_idx_sort_indices = torch.argsort(layer.g_idx).to(torch.int)
+                sorted_g_idx = layer.g_idx[g_idx_sort_indices]
+                replace_tensor("g_idx", sorted_g_idx)
+                replace_tensor("g_idx_sort_indices", g_idx_sort_indices)
+            else:
+                # Reset g_idx related tensors
+                layer.g_idx = Parameter(torch.empty(0,
+                                                    dtype=torch.int,
+                                                    device=cur_device),
+                                        requires_grad=False)
+                layer.g_idx_sort_indices = Parameter(torch.empty(
+                    0, dtype=torch.int, device=cur_device),
+                                                     requires_grad=False)
+            # Repack weights
+            marlin_qweight = ops.gptq_marlin_repack(
+                layer.qweight,
+                layer.g_idx_sort_indices,
+                part_size_k,
+                part_size_n,
+            )
+            replace_tensor("qweight", marlin_qweight)
+            # Permute scales
+            scales_size_k = part_size_k
+            scales_size_n = part_size_n
+            if self.quant_config.desc_act:
+                scales_size_k = full_size_k
+            marlin_scales = marlin_permute_scales(layer.scales, scales_size_k,
+                                                  scales_size_n,
+                                                  self.quant_config.group_size)
+            replace_tensor("scales", marlin_scales)
+        output = ops.gptq_marlin_gemm(reshaped_x, layer.qweight, layer.scales,
+                                      layer.g_idx, layer.g_idx_sort_indices,
+                                      layer.workspace, size_m, part_size_n,
+                                      part_size_k, layer.is_k_full)
+        if bias is not None:
+            output.add_(bias)  # In-place add
+        return output.reshape(out_shape)