Unverified commit f060b8da authored by Muyang Li, committed by GitHub

[Major] Release v0.1.4

Support a 4-bit text encoder and per-layer CPU offloading, reducing FLUX's minimum memory requirement to just 4 GiB while maintaining a 2–3× speedup. Fix various issues related to resolution, LoRA, memory pinning, and runtime stability. Check out the release notes for full details!
parents f549dfc6 873a35be
......@@ -28,6 +28,33 @@ private:
mio::mmap_source impl;
};
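// MMapImplRead is a read-based fallback: instead of memory-mapping the file, it reads the
// whole file into a host buffer (BufferHost, i.e. pinned host memory, when pin is true;
// plain BufferMalloc otherwise). It is used below when registering the mmap with CUDA fails.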
class SafeTensors::MMapImplRead : public SafeTensors::MMapImpl {
public:
MMapImplRead(const std::string &filename, bool pin) {
std::ifstream fin(filename, std::ios::binary);
fin.seekg(0, std::ios::end);
size_t size = fin.tellg();
fin.seekg(0);
if (pin) {
buffer = std::make_unique<BufferHost>(size);
} else {
buffer = std::make_unique<BufferMalloc>(size);
}
fin.read((char *)buffer->getPtr(), size);
}
virtual size_t size() override {
return buffer->getSize();
}
virtual const char *data() override {
return (const char *)buffer->getPtr();
}
private:
std::unique_ptr<Buffer> buffer;
};
#ifdef __linux__
#include <unistd.h>
......@@ -89,26 +116,78 @@ public:
#endif
SafeTensors::SafeTensors(const std::string &filename) {
this->mapped = std::make_unique<MMapImplMio>(filename);
this->hostRegistered = false;
this->memoryPinned = false;
if (cudaHostRegister(const_cast<char *>(this->mapped->data()), this->mapped->size(), cudaHostRegisterPortable | cudaHostRegisterReadOnly) != cudaSuccess) {
spdlog::warn("Unable to pin memory: {}", cudaGetErrorString(cudaGetLastError()));
// mlock(const_cast<char *>(this->mapped->data()), this->mapped->size());
#ifdef __linux__
spdlog::info("Try MAP_PRIVATE");
this->mapped.reset();
auto methodPrivate = [&]() {
this->mapped = std::make_unique<MMapImplPrivate>(filename);
checkCUDA(cudaHostRegister(const_cast<char *>(this->mapped->data()), this->mapped->size(), cudaHostRegisterPortable));
this->hostRegistered = true;
this->memoryPinned = true;
};
auto methodMio = [&]() {
this->mapped = std::make_unique<MMapImplMio>(filename);
checkCUDA(cudaHostRegister(const_cast<char *>(this->mapped->data()), this->mapped->size(), cudaHostRegisterPortable | cudaHostRegisterReadOnly));
this->hostRegistered = true;
this->memoryPinned = true;
};
auto methodRead = [&]() {
this->mapped = std::make_unique<MMapImplRead>(filename, true);
this->memoryPinned = true;
};
auto methodReadNopin = [&]() {
this->mapped = std::make_unique<MMapImplRead>(filename, false);
};
const std::map<std::string, std::function<void()>> methods = {
{ "PRIVATE", methodPrivate },
{ "MIO", methodMio },
{ "READ", methodRead },
{ "READNOPIN", methodReadNopin },
};
auto tryMethod = [&](std::string name) {
spdlog::debug("Trying to load safetensors using method {}", name);
this->mapped.reset();
try {
methods.at(name)();
return true;
} catch (std::exception &e) {
spdlog::warn("Failed to load safetensors using method {}: {}", name, e.what());
}
return false;
};
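// Method selection, summarized:
//  - If NUNCHAKU_LOAD_METHOD is set (PRIVATE, MIO, READ, or READNOPIN), only that method is tried.
//  - Otherwise the methods are tried in order until one succeeds:
//      Linux:  PRIVATE -> MIO -> READ -> READNOPIN
//      other:  MIO -> READ -> READNOPIN
//  - READNOPIN leaves memoryPinned false, so host-to-device copies go through pageable memory
//    (hence the "Memory not pinned" warning below).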
if (char *env = getenv("NUNCHAKU_LOAD_METHOD")) {
std::string method = std::string(env);
tryMethod(method);
} else {
#ifdef __linux__
tryMethod("PRIVATE") || tryMethod("MIO") || tryMethod("READ") || tryMethod("READNOPIN");
#else
tryMethod("MIO") || tryMethod("READ") || tryMethod("READNOPIN");
#endif
}
if (!this->mapped) {
throw std::runtime_error("Failed to load safetensors");
}
if (!this->memoryPinned) {
spdlog::warn("Memory not pinned");
}
parseHeader();
}
SafeTensors::~SafeTensors() {
#ifndef _WIN32
checkCUDA(cudaHostUnregister(const_cast<char *>(this->mapped->data())));
#endif
if (this->hostRegistered) {
if (cudaHostUnregister(const_cast<char *>(this->mapped->data())) != cudaSuccess) {
spdlog::warn("cudaHostUnregister failed: {}", cudaGetErrorString(cudaGetLastError()));
}
}
}
void SafeTensors::parseHeader() {
......
......@@ -44,6 +44,7 @@ private:
class MMapImpl;
class MMapImplMio;
class MMapImplPrivate;
class MMapImplRead;
struct TensorInfo {
TensorShape shape;
......@@ -54,4 +55,6 @@ private:
};
std::map<std::string, TensorInfo> tensors;
std::unique_ptr<MMapImpl> mapped;
bool hostRegistered, memoryPinned;
};
\ No newline at end of file
......@@ -85,14 +85,15 @@ public:
if (size == 0) {
this->ptr = nullptr;
}
checkCUDA(cudaMallocAsync(&this->ptr, size, 0)); // use default stream to sync with all other streams
// TODO: buffer used in multiple streams?
checkCUDA(cudaMallocAsync(&this->ptr, size, getCurrentCUDAStream()));
}
virtual ~BufferCUDA() {
if (this->size == 0) {
assert(!this->ptr);
return;
}
checkCUDA(cudaFreeAsync(this->ptr, 0));
checkCUDA(cudaFreeAsync(this->ptr, getCurrentCUDAStream()));
}
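// Note: cudaMallocAsync / cudaFreeAsync are stream-ordered. Allocating and freeing on
// getCurrentCUDAStream() assumes every use of the buffer is ordered on (or synchronized with)
// that stream; sharing the buffer across streams would need extra event synchronization,
// which is what the TODO above refers to.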
virtual bool isAsyncBuffer() override {
return true;
......@@ -217,7 +218,7 @@ class Tensor {
public:
enum ScalarType {
INVALID_SCALAR_TYPE,
INT8, INT32, INT64,
INT8, INT16, INT32, INT64,
FP16, FP32, BF16,
FP8_E4M3, FP8_E5M2,
};
......@@ -361,7 +362,7 @@ public:
Tensor &zero_() {
assert(this->is_contiguous());
checkCUDA(cudaMemset(data_ptr<char>() + shape.offset * scalar_size(), 0, shape.size() * scalar_size()));
checkCUDA(cudaMemsetAsync(data_ptr<char>() + shape.offset * scalar_size(), 0, shape.size() * scalar_size(), getCurrentCUDAStream()));
return *this;
}
Tensor &copy_(Tensor other) {
......@@ -541,6 +542,7 @@ public:
inline const std::map<Tensor::ScalarType, size_t> Tensor::scalarSize = {
{INT8, 1},
{INT16, 2},
{INT32, 4},
{INT64, 8},
{FP16, 2},
......
......@@ -63,6 +63,49 @@ inline cudaStream_t getCurrentCUDAStream() {
return stackCUDAStreams.top();
}
struct CUDAStreamContext {
cudaStream_t stream;
CUDAStreamContext(cudaStream_t stream) : stream(stream) {
stackCUDAStreams.push(stream);
}
CUDAStreamContext(const CUDAStreamContext &) = delete;
CUDAStreamContext(CUDAStreamContext &&) = delete;
~CUDAStreamContext() {
assert(stackCUDAStreams.top() == stream);
stackCUDAStreams.pop();
}
};
struct CUDAStreamWrapper {
cudaStream_t stream;
CUDAStreamWrapper() {
checkCUDA(cudaStreamCreate(&stream));
}
CUDAStreamWrapper(const CUDAStreamWrapper &) = delete;
CUDAStreamWrapper(CUDAStreamWrapper &&) = delete;
~CUDAStreamWrapper() {
checkCUDA(cudaStreamDestroy(stream));
}
};
struct CUDAEventWrapper {
cudaEvent_t event;
CUDAEventWrapper(unsigned int flags = cudaEventDefault) {
checkCUDA(cudaEventCreateWithFlags(&event, flags));
}
CUDAEventWrapper(const CUDAEventWrapper &) = delete;
CUDAEventWrapper(CUDAEventWrapper &&) = delete;
~CUDAEventWrapper() {
checkCUDA(cudaEventDestroy(event));
}
};
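// Usage sketch for the RAII helpers above (illustrative, not part of this diff; tensor stands
// for any Tensor):
//   CUDAStreamWrapper worker;                  // owns a cudaStream_t for its lifetime
//   {
//       CUDAStreamContext ctx(worker.stream);  // push: getCurrentCUDAStream() now returns worker.stream
//       tensor.zero_();                        // async work is issued on the pushed stream
//   }                                          // pop on scope exit; the destructor asserts LIFO nesting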
inline cudaDeviceProp *getCurrentDeviceProperties() {
static thread_local cudaDeviceProp prop;
static thread_local bool propAvailable = false;
......
......@@ -28,6 +28,7 @@ Tensor from_torch(at::Tensor input) {
{ at::ScalarType::Float, Tensor::FP32 },
{ at::ScalarType::Half, Tensor::FP16 },
{ at::ScalarType::BFloat16, Tensor::BF16 },
{ at::ScalarType::Short, Tensor::INT16 },
{ at::ScalarType::Float8_e4m3fn, Tensor::FP8_E4M3 },
{ at::ScalarType::Float8_e5m2, Tensor::FP8_E5M2 },
};
......@@ -55,6 +56,7 @@ at::Tensor to_torch(Tensor input) {
{ Tensor::FP32, at::ScalarType::Float },
{ Tensor::FP16, at::ScalarType::Half },
{ Tensor::BF16, at::ScalarType::BFloat16 },
{ Tensor::INT16, at::ScalarType::Short },
{ Tensor::FP8_E4M3, at::ScalarType::Float8_e4m3fn },
{ Tensor::FP8_E5M2, at::ScalarType::Float8_e5m2 },
};
......
#include <cuda_fp16.h>
#include <cuda_bf16.h>
#include "semaphore.h"
#include "gemm_awq.h"
//#include "../../../nunchaku/csrc/quantization/dequantize.cuh"
#include "dequantize.cuh"
#include <stdio.h>
#include "../dispatch_utils.h"
//#include "../../../nunchaku/csrc/utils.cuh"
#include "../utils.cuh"
#include <cuda_pipeline_primitives.h>
#define kInterleave 4
#define OP_M 16
#define OP_N 8
#define OP_K 16
#define INTRIN_M 16
#define INTRIN_N 16
#define INTRIN_K 16
#define WARP_SIZE 32
#define SMEM_PAD_A 0
#define SMEM_PAD_B 0
#define PACK_SIZE 8
#if (__CUDACC_VER_MAJOR__ > 11) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 4)) // CUDA >= 11.4
#define L2_CACHEHINT(size) ".L2::" #size "B"
#else
#define L2_CACHEHINT(size)
#endif
#define KERNEL_LAUNCH_CODE \
int num_mn_tiles = (num_in_feats + CTA_M - 1) / CTA_M * (num_out_channels + CTA_N - 1) / CTA_N; \
Tensor _semaphores = Tensor::empty({num_mn_tiles}, Tensor::INT32, _in_feats.device()); \
auto semaphores = reinterpret_cast<int *>(_semaphores.data_ptr<int>()); \
constexpr int NUM_WARPS = (CTA_M / WARP_M) * (CTA_N / WARP_N) * (CTA_K / WARP_K); \
constexpr int SCALES_SMEM_SIZE = (G >= CTA_K) ? (CTA_N / (G / CTA_K) * STAGES * 2) : (CTA_N * (CTA_K / G) * STAGES * 2); \
constexpr int kSmemByteSize = (CTA_M * (CTA_K + SMEM_PAD_A) + CTA_N * (CTA_K + SMEM_PAD_B) / kInterleave + SCALES_SMEM_SIZE) * STAGES * sizeof(f16_t); \
if (kSmemByteSize >= 99 * 1024) \
{ \
printf("This kernel requires %d Bytes of shared memory, which exceeds device limit.\n", kSmemByteSize); \
return _out_feats; \
} \
int j_factors1 = num_out_channels / CTA_N / 1; \
dim3 num_blocks((num_out_feats + CTA_M - 1) / CTA_M * j_factors1 * SPLITK); \
dim3 threads_per_block(WARP_SIZE, NUM_WARPS); \
auto kernel_func = gemm_w4a16_T1<f16_t, CTA_M, CTA_N, CTA_K, WARP_M, WARP_N, WARP_K, STAGES, G, SPLITK>; \
cudaFuncSetAttribute(kernel_func, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemByteSize); \
kernel_func<<<num_blocks, threads_per_block, kSmemByteSize>>>( \
in_feats, kernel, scales, zeros, out_feats, semaphores, num_in_feats, num_out_channels, num_in_channels);
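// Shared-memory budget, worked through for the smallest config below (CTA_M=16, CTA_N=128,
// CTA_K=128, STAGES=4, G=128, sizeof(f16_t)=2):
//   SCALES_SMEM_SIZE = 128 / (128/128) * 4 * 2            = 1024
//   kSmemByteSize    = (16*128 + 128*128/4 + 1024) * 4 * 2 = 57344 B (= 56 KiB)
// which stays under the 99 KiB guard, so the kernel is launched with the raised
// cudaFuncAttributeMaxDynamicSharedMemorySize.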
template <int N>
__inline__ __host__ __device__ int get_log_tile(int n)
{
if (N >= 8 && n >= 6)
return 3;
else if (N >= 4 && n >= 3)
return 2;
else if (N >= 2 && n >= 2)
return 1;
else
return 0;
}
__inline__ __device__ uint2 get_block_idx_mapping(int blockIdx_x, int blockIdx_y, int log_tile)
{
return make_uint2((blockIdx_x >> log_tile), (blockIdx_y << log_tile) + ((blockIdx_x) & ((1 << (log_tile)) - 1)));
}
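// get_block_idx_mapping is a CTA rasterization swizzle (in the style of CUTLASS): groups of
// 2^log_tile consecutive x indices map to the same output x tile and to consecutive y tiles,
// e.g. with log_tile = 2, (x, y) for x = 0..3 maps to (0, 4*y + x), which is intended to
// improve L2 reuse. Note that the kernels below call get_log_tile<1>(...), which always
// returns 0, so the mapping there degenerates to the identity.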
template <int SLICES, int NUM_WARPS_MN>
__device__ void sync_slice(int slice_id)
{
if constexpr (SLICES == 1)
{
__syncthreads();
}
else
{
constexpr int SLICE_GROUP = (SLICES + 7) / 8;
constexpr uint32_t num_threads = NUM_WARPS_MN * WARP_SIZE;
const uint32_t barrier_id = slice_id / SLICE_GROUP + 1;
asm volatile("bar.sync %0, %1;" : : "r"(barrier_id), "n"(num_threads));
}
}
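// sync_slice synchronizes only the warps that share one K slice: with a single slice it is a
// plain __syncthreads(); otherwise it uses a PTX named barrier (bar.sync id, thread_count),
// with the "+ 1" keeping barrier 0 free for __syncthreads, so different slices can proceed
// independently.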
__inline__ __device__ uint32_t cast_smem_ptr_to_uint(void const *const ptr)
{
uint32_t smem_int_ptr;
asm("{.reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %1; cvt.u32.u64 %0, smem_ptr; }\n"
: "=r"(smem_int_ptr)
: "l"(ptr));
return smem_int_ptr;
}
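// Converts a generic-address-space pointer into the 32-bit shared-memory address expected by
// the ldmatrix and cp.async operands below (cvta.to.shared followed by a 64->32-bit narrowing).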
template <typename f16_t>
__inline__ __device__ void ldmatrix_m8n8_x4_b16(f16_t *shared_warp, int ax0_0, uint32_t addr)
{
static_assert(std::is_same<f16_t, half>::value || std::is_same<f16_t, __nv_bfloat16>::value,
"ldmatrix_m8n8_x4_b16 supports only half or __nv_bfloat16 types.");
asm volatile(
"ldmatrix.sync.aligned.m8n8.x4.shared.b16"
"{%0, %1, %2, %3}, [%4];"
: "=r"(((unsigned *)(shared_warp + (ax0_0 * 8)))[0]), "=r"(((unsigned *)(shared_warp + (ax0_0 * 8)))[1]), "=r"(((unsigned *)(shared_warp + (ax0_0 * 8)))[2]), "=r"(((unsigned *)(shared_warp + (ax0_0 * 8)))[3])
: "r"(addr));
}
template <typename f16_t>
__inline__ __device__ void ldmatrix_m8n8_x4_trans_b16(f16_t *shared_warp, int ax0_0, uint32_t addr)
{
static_assert(std::is_same<f16_t, half>::value || std::is_same<f16_t, __nv_bfloat16>::value,
"ldmatrix_m8n8_x4_trans_b16 supports only half or __nv_bfloat16 types.");
asm volatile(
"ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16"
"{%0, %1, %2, %3}, [%4];"
: "=r"(((unsigned *)(shared_warp + (ax0_0 * 8)))[0]), "=r"(((unsigned *)(shared_warp + (ax0_0 * 8)))[1]), "=r"(((unsigned *)(shared_warp + (ax0_0 * 8)))[2]), "=r"(((unsigned *)(shared_warp + (ax0_0 * 8)))[3])
: "r"(addr));
}
__inline__ __device__ void cp_async_cg_A(uint32_t smem_int_ptr, const uint4 *__restrict__ src, bool mask)
{
const int cp_size = 16;
asm volatile("{"
" .reg .pred p;"
" setp.ne.b32 p, %0, 0;"
" @p cp.async.cg.shared.global" L2_CACHEHINT(128) " [%1], [%2], %3;"
"}" ::"r"((int)mask),
"r"(smem_int_ptr),
"l"(src),
"n"(cp_size));
}
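// Predicated 16-byte cp.async.cg copy from global to shared memory: when mask is false the
// copy is skipped entirely (the destination is left untouched, not zero-filled), and the
// L2 cache hint is emitted only when the compiler version check above enables it.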
template <typename f16_t>
__device__ __inline__ void mma_m16n8k16(float *C_warp, f16_t *A_shared_warp, f16_t *B_shared_warp);
template <>
__device__ __inline__ void mma_m16n8k16<half>(float *C_warp, half *A_shared_warp, half *B_shared_warp)
{
asm volatile(
"mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32"
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};"
: "=f"(((float *)C_warp)[0]), "=f"(((float *)C_warp)[1]), "=f"(((float *)C_warp)[2]), "=f"(((float *)C_warp)[3])
: "r"(((unsigned *)A_shared_warp)[0]), "r"(((unsigned *)A_shared_warp)[1]), "r"(((unsigned *)A_shared_warp)[2]), "r"(((unsigned *)A_shared_warp)[3]), "r"(((unsigned *)B_shared_warp)[0]), "r"(((unsigned *)B_shared_warp)[1]), "f"(((float *)C_warp)[0]), "f"(((float *)C_warp)[1]), "f"(((float *)C_warp)[2]), "f"(((float *)C_warp)[3]));
}
template <>
__device__ __inline__ void mma_m16n8k16<__nv_bfloat16>(float *C_warp, __nv_bfloat16 *A_shared_warp, __nv_bfloat16 *B_shared_warp)
{
asm volatile(
"mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32"
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};"
: "=f"(((float *)C_warp)[0]), "=f"(((float *)C_warp)[1]), "=f"(((float *)C_warp)[2]), "=f"(((float *)C_warp)[3])
: "r"(((unsigned *)A_shared_warp)[0]), "r"(((unsigned *)A_shared_warp)[1]), "r"(((unsigned *)A_shared_warp)[2]), "r"(((unsigned *)A_shared_warp)[3]), "r"(((unsigned *)B_shared_warp)[0]), "r"(((unsigned *)B_shared_warp)[1]), "f"(((float *)C_warp)[0]), "f"(((float *)C_warp)[1]), "f"(((float *)C_warp)[2]), "f"(((float *)C_warp)[3]));
}
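// Each specialization issues one m16n8k16 tensor-core MMA per warp: A contributes a 16x16
// f16/bf16 tile (4 x .b32 registers per thread), B a 16x8 tile (2 x .b32 registers), and
// C/D accumulate a 16x8 tile as 4 floats per thread.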
template <typename f16_t, int CTA_M, int CTA_N, int CTA_K, int CTA_SIZE, int SHARED_K_ITERS, int STAGES>
__device__ __inline__ void global_to_share_one_stage_A(f16_t *src, f16_t *dst, int global_nrows, int global_ncols, int cta_offset_m, int cta_offset_n, int cta_offset_k, int global_iter_k, int shared_iter_k, bool mask)
{
constexpr int threads_needed = (CTA_M * CTA_K) / PACK_SIZE / SHARED_K_ITERS;
constexpr int threads_used = threads_needed < CTA_SIZE ? threads_needed : CTA_SIZE;
constexpr int total_global_iters = (CTA_M * CTA_K) / PACK_SIZE / threads_used;
constexpr int partial_global_iters = (total_global_iters + SHARED_K_ITERS - 1) / SHARED_K_ITERS;
constexpr int cta_step_m_or_n = (threads_used * PACK_SIZE) / CTA_K;
constexpr int warp_step_m_or_n = (WARP_SIZE * PACK_SIZE) / CTA_K;
constexpr int threads_per_row = CTA_K / PACK_SIZE;
constexpr int kSmemCol = CTA_K + SMEM_PAD_A;
bool local_mask = mask & (threadIdx.y * WARP_SIZE + threadIdx.x < threads_used);
int ld_col = (threadIdx.x % threads_per_row);
#pragma unroll
for (int _global_iter = 0; _global_iter < partial_global_iters; ++_global_iter)
{
int global_iter = shared_iter_k * partial_global_iters + _global_iter;
int ld_row = global_iter * cta_step_m_or_n + threadIdx.y * warp_step_m_or_n + (threadIdx.x / threads_per_row);
int ld_col_swizzled = (ld_col ^ (ld_row) & 7) * PACK_SIZE;
void *dst_ptr = (void *)(dst + ld_row * kSmemCol + ld_col_swizzled);
uint4 *src_ptr = (uint4 *)(src + (ld_row + cta_offset_m) * global_ncols + ld_col * PACK_SIZE + global_iter_k * CTA_K + cta_offset_k); // cta_offset_m * global_ncols + global_iter * cta_step_m_or_n * global_ncols + threadIdx.y * warp_step_m_or_n * global_ncols + (threadIdx.x / threads_per_row) * global_ncols + global_iter_k * CTA_K + (threadIdx.x % threads_per_row) * PACK_SIZE);
if constexpr (STAGES > 1)
{
uint32_t addr = cast_smem_ptr_to_uint(dst_ptr);
cp_async_cg_A(addr, src_ptr, local_mask & (ld_row + cta_offset_m < global_nrows));
}
else
{
if (local_mask & (ld_row + cta_offset_m < global_nrows))
*(uint4 *)dst_ptr = *src_ptr;
}
}
}
template <typename f16_t, int CTA_M, int CTA_N, int CTA_K, int CTA_SIZE, int SHARED_K_ITERS, int STAGES>
__device__ __inline__ void global_to_share_one_stage_B(f16_t *src, f16_t *dst, int global_ncols, int cta_offset_m, int cta_offset_n, int cta_offset_k, int global_iter_k, int shared_iter_k, bool mask)
{
constexpr int threads_needed = (CTA_N / kInterleave * CTA_K) / PACK_SIZE / SHARED_K_ITERS;
constexpr int threads_used = threads_needed < CTA_SIZE ? threads_needed : CTA_SIZE;
constexpr int total_global_iters = (CTA_N / kInterleave * CTA_K) / PACK_SIZE / threads_used;
constexpr int partial_global_iters = (total_global_iters + SHARED_K_ITERS - 1) / SHARED_K_ITERS;
constexpr int cta_step_m_or_n = (threads_used * PACK_SIZE) / CTA_K;
constexpr int warp_step_m_or_n = (WARP_SIZE * PACK_SIZE) / CTA_K;
constexpr int threads_per_row = CTA_K / PACK_SIZE;
constexpr int kSmemCol = CTA_K + SMEM_PAD_B;
bool local_mask = mask & (threadIdx.y * WARP_SIZE + threadIdx.x < threads_used);
#pragma unroll
for (int _global_iter = 0; _global_iter < partial_global_iters; ++_global_iter)
{
int global_iter = shared_iter_k * partial_global_iters + _global_iter;
int ld_row = global_iter * cta_step_m_or_n + threadIdx.y * warp_step_m_or_n + (threadIdx.x / threads_per_row);
int ld_col = (threadIdx.x % threads_per_row);
int ld_col_swizzled = ld_col ^ (ld_row % 2) & 7;
void *dst_ptr = (void *)(dst + (ld_row * kSmemCol + ld_col_swizzled * PACK_SIZE));
uint4 *src_ptr = (uint4 *)(src + global_iter_k * CTA_K + cta_offset_n / kInterleave * global_ncols + ld_row * global_ncols + ld_col * PACK_SIZE + cta_offset_k);
if constexpr (STAGES > 1)
{
uint32_t addr = cast_smem_ptr_to_uint(dst_ptr);
cp_async_cg_A(addr, src_ptr, local_mask);
}
else
{
if (local_mask)
*(uint4 *)dst_ptr = *src_ptr;
}
}
}
template <typename f16_t, int CTA_M, int CTA_N, int CTA_K, int CTA_SIZE, int STAGES, int G>
__device__ __inline__ void global_to_share_one_stage_scales(f16_t *src, f16_t *dst, f16_t *src_z, f16_t *dst_z, int global_ncols, int cta_offset_m, int cta_offset_n, int cta_offset_k, int global_iter_k, int shared_iter_k, bool mask)
{
constexpr int LD_AMOUNT = (G >= CTA_K) ? CTA_N : CTA_N * CTA_K / G;
constexpr int threads_needed = LD_AMOUNT / PACK_SIZE / 1;
constexpr int threads_used = threads_needed < CTA_SIZE ? threads_needed : CTA_SIZE;
constexpr int total_global_iters = LD_AMOUNT / PACK_SIZE / threads_used;
constexpr int threads_per_row = CTA_N / PACK_SIZE;
constexpr int kSmemCol = CTA_N;
bool local_mask = mask & (threadIdx.y * WARP_SIZE + threadIdx.x < threads_used);
int g_idx = (cta_offset_k + global_iter_k * CTA_K) / G;
void *dst_ptr = (void *)(dst + (threadIdx.x / threads_per_row) * kSmemCol + (threadIdx.x % threads_per_row) * PACK_SIZE);
uint4 *src_ptr = (uint4 *)(src + g_idx * global_ncols + cta_offset_n + (threadIdx.x / threads_per_row) * global_ncols + (threadIdx.x % threads_per_row) * PACK_SIZE);
void *dst_ptr_z = (void *)(dst_z + (threadIdx.x / threads_per_row) * kSmemCol + (threadIdx.x % threads_per_row) * PACK_SIZE);
uint4 *src_ptr_z = (uint4 *)(src_z + g_idx * global_ncols + cta_offset_n + (threadIdx.x / threads_per_row) * global_ncols + (threadIdx.x % threads_per_row) * PACK_SIZE);
if (STAGES > 1)
{
uint32_t addr = cast_smem_ptr_to_uint(dst_ptr);
cp_async_cg_A(addr, src_ptr, local_mask);
uint32_t addr_z = cast_smem_ptr_to_uint(dst_ptr_z);
cp_async_cg_A(addr_z, src_ptr_z, local_mask);
}
else
{
if (local_mask)
{
*(uint4 *)dst_ptr = *src_ptr;
*(uint4 *)dst_ptr_z = *src_ptr_z;
}
}
}
template <typename f16_t, int CTA_M, int CTA_N, int CTA_K, int STAGES, int shared_iters>
__device__ __inline__ void share_to_reg_one_stage_A(f16_t *src, f16_t *dst, int warp_offset_m, int warp_offset_n, int warp_offset_k, int k_0_1)
{
constexpr int kSmemCol = CTA_K + SMEM_PAD_A;
for (int shared_iter = 0; shared_iter < shared_iters; ++shared_iter)
{
int ld_row = warp_offset_m + shared_iter * OP_M + (threadIdx.x % 16);
int ld_col = k_0_1 * 16 + (threadIdx.x / 16) * 8 + warp_offset_k;
int ld_col_swizzled = ((ld_col / PACK_SIZE) ^ (ld_row) & 7) * PACK_SIZE;
void *addr_ptr = (void *)(src + ld_row * kSmemCol + ld_col_swizzled);
uint32_t addr = cast_smem_ptr_to_uint(addr_ptr);
ldmatrix_m8n8_x4_b16(dst, shared_iter, addr);
}
}
template <typename f16_t, int CTA_M, int CTA_N, int CTA_K, int STAGES, bool ldmatrix, int shared_iters, int G>
__device__ __inline__ void share_to_reg_one_stage_B(f16_t *src, f16_t *src_scales, f16_t *src_zeros, f16_t *dst, f16_t *dst_fp16, int warp_offset_m, int warp_offset_n, int warp_offset_k, int k_0_1)
{
using f162_t = typename packed_as<f16_t, 2>::type;
constexpr int kSmemCol = CTA_K + SMEM_PAD_B;
int r0 = ((threadIdx.x / 8 / 2) * 8 + threadIdx.x % 8);
int c0 = ((threadIdx.x / 8) % 2) * 8;
int r = r0 / 4;
int c = (r0 % 4) * 16 + c0;
int c_swizzled = ((c / PACK_SIZE) ^ (r % 2) & 7) * PACK_SIZE;
if constexpr (ldmatrix)
{
#pragma unroll
for (int shared_iter = 0; shared_iter < shared_iters; ++shared_iter)
{
void *addr_ptr = (void *)(src + warp_offset_n / kInterleave * kSmemCol + shared_iter * 16 / kInterleave * kSmemCol + k_0_1 * 16 + r * kSmemCol + c_swizzled + warp_offset_k);
uint32_t addr = cast_smem_ptr_to_uint(addr_ptr);
ldmatrix_m8n8_x4_b16(dst, shared_iter, addr);
}
}
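// The loop below dequantizes the packed INT4 weights on the fly: each iteration expands one
// 32-bit word (eight 4-bit values) into four f16x2 pairs via dequantize_s4_to_fp16x2 and
// applies w * scale + zero with __hfma2, producing the f16 B operands fed to the MMAs.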
#pragma unroll
for (int shared_iter = 0; shared_iter < shared_iters; ++shared_iter)
{
f16_t scale = src_scales[(warp_offset_k / G) * CTA_N + warp_offset_n + 16 * shared_iter + 8 * (k_0_1 % 2) + threadIdx.x / 4];
f16_t zero = src_zeros[(warp_offset_k / G) * CTA_N + warp_offset_n + 16 * shared_iter + 8 * (k_0_1 % 2) + threadIdx.x / 4];
f162_t scale2 = f162f162(scale);
f162_t zero2 = f162f162(zero);
f162_t loaded[4];
dequantize_s4_to_fp16x2(*reinterpret_cast<f162_t *>(dst + (k_0_1 % 2) * 4 + (k_0_1 / 2 * 2) + shared_iter * 8), reinterpret_cast<uint4 *>(loaded));
#pragma unroll
for (int i = 0; i < 4; i++)
{
loaded[i] = __hfma2(loaded[i], scale2, zero2);
}
*reinterpret_cast<uint4 *>(dst_fp16 + shared_iter * 16 + 8 * (k_0_1 % 2)) = *reinterpret_cast<uint4 *>(loaded);
}
}
template <typename f16_t, int CTA_M, int CTA_N, int CTA_K, int WARP_M, int WARP_N, int WARP_K, int STAGES, int G, int SPLITK>
__global__ void gemm_w4a16_T1(f16_t *__restrict__ A, f16_t *__restrict__ B, f16_t *__restrict__ scales, f16_t *__restrict__ zeros, f16_t *__restrict__ C, int *__restrict__ semaphores, int M, int N, int K)
{
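// High-level structure, summarized: the prologue prefetches STAGES-1 pipeline stages of A, B,
// scales and zeros into shared memory with cp.async; the main loop overlaps further cp.async
// loads with ldmatrix loads, on-the-fly INT4 dequantization and m16n8k16 MMAs; when SLICES > 1
// the K slices are reduced through shared memory; and when SPLITK > 1, CTAs covering the same
// output tile serialize their partial-sum accumulation into C via the semaphore array.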
using f162_t = typename packed_as<f16_t, 2>::type;
constexpr int NUM_WARPS_MN = CTA_M / WARP_M * CTA_N / WARP_N;
constexpr int NUM_WARPS = NUM_WARPS_MN * CTA_K / WARP_K;
constexpr int CTA_SIZE = NUM_WARPS * WARP_SIZE;
constexpr int CTA_SIZE_MN = NUM_WARPS_MN * WARP_SIZE;
constexpr int SLICES = CTA_K / WARP_K;
int num_blocks_n = (N + CTA_N - 1) / CTA_N;
int num_blocks_m = (M + CTA_M - 1) / CTA_M;
int blockIdx_x = 0;
int blockIdx_y = blockIdx.x % (num_blocks_m * num_blocks_n);
int blockIdx_z = blockIdx.x / (num_blocks_m * num_blocks_n);
const int log_tile = get_log_tile<1>((N + CTA_N - 1) / CTA_N);
int blockIdx_m = blockIdx_y / (num_blocks_n >> log_tile);
int blockIdx_n = blockIdx_y % (num_blocks_n >> log_tile);
const uint2 block_idx_mapping = get_block_idx_mapping(blockIdx_m, blockIdx_n, log_tile);
blockIdx_m = block_idx_mapping.x;
blockIdx_n = block_idx_mapping.y;
float C_warp[CTA_M * CTA_N / CTA_SIZE_MN];
constexpr int kSmemPadKA = CTA_K + SMEM_PAD_A;
constexpr int kSmemPadKB = CTA_K + SMEM_PAD_B;
constexpr int kSmemSizeAPerStage = CTA_M * kSmemPadKA;
constexpr int kSmemSizeBPerStage = CTA_N / kInterleave * kSmemPadKB;
constexpr int kSmemSizeA = kSmemSizeAPerStage * STAGES;
constexpr int kSmemSizeB = kSmemSizeBPerStage * STAGES;
constexpr int scales_load_interval = G >= CTA_K ? G / CTA_K : 1;
constexpr int scales_per_load = G < CTA_K ? CTA_K / G : 1;
constexpr int kSmemSizeScales = CTA_N * STAGES / scales_load_interval * scales_per_load;
constexpr int kSmemSizeZeros = CTA_N * STAGES / scales_load_interval * scales_per_load;
extern __shared__ half mem_shared[];
f16_t *A_shared = reinterpret_cast<f16_t *>(mem_shared);
f16_t *B_shared = reinterpret_cast<f16_t *>(mem_shared + kSmemSizeA);
f16_t *scales_shared = reinterpret_cast<f16_t *>(mem_shared + kSmemSizeA + kSmemSizeB);
f16_t *zeros_shared = reinterpret_cast<f16_t *>(mem_shared + kSmemSizeA + kSmemSizeB + kSmemSizeScales);
float *C_shared = reinterpret_cast<float *>(mem_shared);
f16_t A_shared_warp_[2][WARP_M * INTRIN_K / WARP_SIZE];
f16_t B_shared_warp_[2][WARP_N * 32 / WARP_SIZE];
f16_t B_shared_warp_tmp_[2][WARP_N * 16 / WARP_SIZE];
int cta_offset_m = blockIdx_m * CTA_M;
int cta_offset_n = blockIdx_n * CTA_N;
int cta_offset_k = blockIdx_z * (K / SPLITK);
int warp_mn = threadIdx.y % NUM_WARPS_MN;
int slice_id = threadIdx.y / NUM_WARPS_MN;
int warp_offset_n = (warp_mn % (CTA_N / WARP_N)) * WARP_N;
int warp_offset_m = (warp_mn / (CTA_N / WARP_N)) * WARP_M;
int warp_offset_k = slice_id * WARP_K;
for (int i = 0; i < CTA_M * CTA_N / CTA_SIZE_MN; i++)
C_warp[i] = 0.0;
int gemm_iters = (K + CTA_K - 1) / CTA_K / SPLITK;
int k_0_0_ld = 0;
int k_0_0 = 0;
constexpr int prologue_stages = STAGES == 1 ? 1 : STAGES - 1;
#pragma unroll
for (k_0_0_ld = 0; k_0_0_ld < prologue_stages; ++k_0_0_ld)
{
global_to_share_one_stage_A<f16_t, CTA_M, CTA_N, CTA_K, CTA_SIZE, 1, STAGES>(A, A_shared + k_0_0_ld * kSmemSizeAPerStage, M, K, cta_offset_m, cta_offset_n, cta_offset_k, k_0_0_ld, 0, true);
global_to_share_one_stage_B<f16_t, CTA_M, CTA_N, CTA_K, CTA_SIZE, 1, STAGES>(B, B_shared + k_0_0_ld * kSmemSizeBPerStage, K, cta_offset_m, cta_offset_n, cta_offset_k, k_0_0_ld, 0, true);
global_to_share_one_stage_scales<f16_t, CTA_M, CTA_N, CTA_K, CTA_SIZE, STAGES, G>(
scales, scales_shared + (k_0_0_ld / scales_load_interval * scales_per_load) * CTA_N,
zeros, zeros_shared + (k_0_0_ld / scales_load_interval * scales_per_load) * CTA_N,
N, cta_offset_m, cta_offset_n, cta_offset_k,
k_0_0_ld, 0, k_0_0_ld < gemm_iters && k_0_0_ld % scales_load_interval == 0);
if constexpr (STAGES > 1)
__pipeline_commit();
}
if constexpr (STAGES > 1)
__pipeline_wait_prior(STAGES - 2);
__syncthreads();
share_to_reg_one_stage_A<f16_t, CTA_M, CTA_N, CTA_K, STAGES, WARP_M / INTRIN_M>(A_shared, A_shared_warp_[0], warp_offset_m, warp_offset_n, warp_offset_k, 0);
share_to_reg_one_stage_B<f16_t, CTA_M, CTA_N, CTA_K, STAGES, true, WARP_N / INTRIN_N, G>(B_shared, scales_shared, zeros_shared, B_shared_warp_tmp_[0], B_shared_warp_[0], warp_offset_m, warp_offset_n, warp_offset_k, 0);
constexpr int SHARED_K_ITERS = WARP_K / INTRIN_K;
for (; k_0_0 < gemm_iters; ++k_0_0, ++k_0_0_ld)
{
int ld_stage = k_0_0_ld % STAGES;
int compute_stage = k_0_0 % STAGES;
f16_t *A_shared_this_compute_stage;
f16_t *B_shared_this_compute_stage;
f16_t *scales_shared_this_compute_stage;
f16_t *zeros_shared_this_compute_stage;
#pragma unroll
for (int iter_k = 0; iter_k < SHARED_K_ITERS; ++iter_k)
{
A_shared_this_compute_stage = A_shared + compute_stage * kSmemSizeAPerStage;
B_shared_this_compute_stage = B_shared + compute_stage * kSmemSizeBPerStage;
scales_shared_this_compute_stage = scales_shared + (compute_stage / scales_load_interval * scales_per_load) * CTA_N;
zeros_shared_this_compute_stage = zeros_shared + (compute_stage / scales_load_interval * scales_per_load) * CTA_N;
share_to_reg_one_stage_A<f16_t, CTA_M, CTA_N, CTA_K, STAGES, WARP_M / INTRIN_M>(A_shared_this_compute_stage, A_shared_warp_[(iter_k + 1) % 2], warp_offset_m, warp_offset_n, warp_offset_k, (iter_k + 1) % SHARED_K_ITERS);
if ((iter_k + 1) % kInterleave == 0)
{
if (compute_stage % 2 == 1)
{
share_to_reg_one_stage_B<f16_t, CTA_M, CTA_N, CTA_K, STAGES, true, WARP_N / INTRIN_N, G>(
B_shared_this_compute_stage, scales_shared_this_compute_stage, zeros_shared_this_compute_stage,
B_shared_warp_tmp_[1], B_shared_warp_[((iter_k + 1) / 2) % 2],
warp_offset_m, warp_offset_n, warp_offset_k, (iter_k + 1) % SHARED_K_ITERS);
}
else
{
share_to_reg_one_stage_B<f16_t, CTA_M, CTA_N, CTA_K, STAGES, true, WARP_N / INTRIN_N, G>(
B_shared_this_compute_stage, scales_shared_this_compute_stage, zeros_shared_this_compute_stage,
B_shared_warp_tmp_[0], B_shared_warp_[((iter_k + 1) / 2) % 2],
warp_offset_m, warp_offset_n, warp_offset_k, (iter_k + 1) % SHARED_K_ITERS);
}
}
else
{
if (compute_stage % 2 == 1)
{
share_to_reg_one_stage_B<f16_t, CTA_M, CTA_N, CTA_K, STAGES, false, WARP_N / INTRIN_N, G>(
B_shared_this_compute_stage, scales_shared_this_compute_stage, zeros_shared_this_compute_stage,
B_shared_warp_tmp_[1], B_shared_warp_[((iter_k + 1) / 2) % 2],
warp_offset_m, warp_offset_n, warp_offset_k, (iter_k + 1) % SHARED_K_ITERS);
}
else
{
share_to_reg_one_stage_B<f16_t, CTA_M, CTA_N, CTA_K, STAGES, false, WARP_N / INTRIN_N, G>(
B_shared_this_compute_stage, scales_shared_this_compute_stage, zeros_shared_this_compute_stage,
B_shared_warp_tmp_[0], B_shared_warp_[((iter_k + 1) / 2) % 2],
warp_offset_m, warp_offset_n, warp_offset_k, (iter_k + 1) % SHARED_K_ITERS);
}
}
f16_t *A_shared_warp = A_shared_warp_[iter_k % 2];
f16_t *B_shared_warp = B_shared_warp_[(iter_k / 2) % 2];
for (int i_0_3 = 0; i_0_3 < WARP_M / INTRIN_M; ++i_0_3)
{
for (int j_0_4 = 0; j_0_4 < WARP_N / INTRIN_N; ++j_0_4)
{
mma_m16n8k16(C_warp + i_0_3 * WARP_N / INTRIN_N * 8 + j_0_4 * 8, A_shared_warp + i_0_3 * 8, B_shared_warp + j_0_4 * 16 + (iter_k % 2) * 4);
mma_m16n8k16(C_warp + i_0_3 * WARP_N / INTRIN_N * 8 + j_0_4 * 8 + 4, A_shared_warp + i_0_3 * 8, B_shared_warp + j_0_4 * 16 + (iter_k % 2) * 4 + 8);
}
}
if (iter_k < WARP_K / INTRIN_K - 1)
{
if constexpr (STAGES == 1)
__syncthreads();
global_to_share_one_stage_A<f16_t, CTA_M, CTA_N, CTA_K, CTA_SIZE, WARP_K / INTRIN_K, STAGES>(A, A_shared + ld_stage * kSmemSizeAPerStage, M, K, cta_offset_m, cta_offset_n, cta_offset_k, k_0_0_ld, iter_k, k_0_0_ld < gemm_iters);
global_to_share_one_stage_B<f16_t, CTA_M, CTA_N, CTA_K, CTA_SIZE, WARP_K / INTRIN_K, STAGES>(B, B_shared + ld_stage * kSmemSizeBPerStage, K, cta_offset_m, cta_offset_n, cta_offset_k, k_0_0_ld, iter_k, k_0_0_ld < gemm_iters);
}
if (iter_k == WARP_K / INTRIN_K - 2)
{
if constexpr (STAGES == 1 && WARP_K / INTRIN_K > 2)
{
__syncthreads();
}
global_to_share_one_stage_A<f16_t, CTA_M, CTA_N, CTA_K, CTA_SIZE, WARP_K / INTRIN_K, STAGES>(A, A_shared + ld_stage * kSmemSizeAPerStage, M, K, cta_offset_m, cta_offset_n, cta_offset_k, k_0_0_ld, iter_k + 1, k_0_0_ld < gemm_iters);
global_to_share_one_stage_B<f16_t, CTA_M, CTA_N, CTA_K, CTA_SIZE, WARP_K / INTRIN_K, STAGES>(B, B_shared + ld_stage * kSmemSizeBPerStage, K, cta_offset_m, cta_offset_n, cta_offset_k, k_0_0_ld, iter_k + 1, k_0_0_ld < gemm_iters);
global_to_share_one_stage_scales<f16_t, CTA_M, CTA_N, CTA_K, CTA_SIZE, STAGES, G>(
scales, scales_shared + (ld_stage / scales_load_interval * scales_per_load) * CTA_N,
zeros, zeros_shared + (ld_stage / scales_load_interval * scales_per_load) * CTA_N,
N, cta_offset_m, cta_offset_n, cta_offset_k,
k_0_0_ld, iter_k, k_0_0_ld < gemm_iters && k_0_0_ld % scales_load_interval == 0);
if constexpr (STAGES > 1)
{
__pipeline_commit();
__pipeline_wait_prior(STAGES - 2);
}
compute_stage = (k_0_0 + 1) % STAGES;
__syncthreads();
}
}
}
__pipeline_commit();
__pipeline_wait_prior(0);
__syncthreads();
if constexpr (SLICES > 1)
{
#pragma unroll
for (int z = 0; z < SLICES; ++z)
{
if (slice_id == z)
{
#pragma unroll
for (int ax0_0_1 = 0; ax0_0_1 < WARP_M / INTRIN_M; ++ax0_0_1)
{
#pragma unroll
for (int ax1_0_1 = 0; ax1_0_1 < WARP_N / INTRIN_N; ++ax1_0_1)
{
#pragma unroll
for (int local_id = 0; local_id < OP_M * 16 / WARP_SIZE; ++local_id)
{
if (z > 0)
{
C_warp[ax0_0_1 * WARP_N / INTRIN_N * 8 + ax1_0_1 * 8 + local_id] += C_shared[warp_offset_m * CTA_N + ax0_0_1 * OP_M * CTA_N + warp_offset_n + ax1_0_1 * 16 + ((local_id % 4) / 2 * 8 + (threadIdx.x / 4)) * CTA_N + (local_id / 4) * 8 + (local_id % 2) + (threadIdx.x % 4) * 2];
}
C_shared[warp_offset_m * CTA_N + ax0_0_1 * OP_M * CTA_N + warp_offset_n + ax1_0_1 * 16 + ((local_id % 4) / 2 * 8 + (threadIdx.x / 4)) * CTA_N + (local_id / 4) * 8 + (local_id % 2) + (threadIdx.x % 4) * 2] = C_warp[ax0_0_1 * WARP_N / INTRIN_N * 8 + ax1_0_1 * 8 + local_id];
};
}
}
}
__syncthreads();
}
if (slice_id == 0)
{
#pragma unroll
for (int ax0_0_1 = 0; ax0_0_1 < WARP_M / INTRIN_M; ++ax0_0_1)
{
#pragma unroll
for (int ax1_0_1 = 0; ax1_0_1 < WARP_N / INTRIN_N; ++ax1_0_1)
{
#pragma unroll
for (int local_id = 0; local_id < OP_M * 16 / WARP_SIZE; ++local_id)
{
C_warp[ax0_0_1 * WARP_N / INTRIN_N * 8 + ax1_0_1 * 8 + local_id] = C_shared[warp_offset_m * CTA_N + ax0_0_1 * OP_M * CTA_N + warp_offset_n + ax1_0_1 * 16 + ((local_id % 4) / 2 * 8 + (threadIdx.x / 4)) * CTA_N + (local_id / 4) * 8 + (local_id % 2) + (threadIdx.x % 4) * 2];
};
}
}
}
}
if (slice_id == 0)
{
Semaphore semaphore(semaphores + blockIdx_y, threadIdx.x);
if constexpr (SPLITK > 1)
{
semaphore.fetch();
}
if (blockIdx_z != 0)
{
semaphore.wait(blockIdx_z);
for (int ax0_0_1 = 0; ax0_0_1 < WARP_M / INTRIN_M; ++ax0_0_1)
{
for (int ax1_0_1 = 0; ax1_0_1 < WARP_N / INTRIN_N; ++ax1_0_1)
{
for (int local_id = 0; local_id < OP_M * 16 / WARP_SIZE; local_id += 2)
{
int write_row = cta_offset_m + warp_offset_m + ax0_0_1 * OP_M + ((local_id % 4) / 2 * 8 + (threadIdx.x / 4));
if (write_row < M)
{
f162_t *existing_psum_ptr = reinterpret_cast<f162_t *>(
C + write_row * N +
cta_offset_n + warp_offset_n + ax1_0_1 * 16 +
(local_id / 4) * 8 + (local_id % 2) + (threadIdx.x % 4) * 2);
*existing_psum_ptr = __hadd2(
*existing_psum_ptr,
cuda_cast<f162_t>(*reinterpret_cast<float2 *>(
C_warp + ax0_0_1 * WARP_N / INTRIN_N * 8 + ax1_0_1 * 8 + local_id)));
}
};
}
}
}
else
{
for (int ax0_0_1 = 0; ax0_0_1 < WARP_M / INTRIN_M; ++ax0_0_1)
{
for (int ax1_0_1 = 0; ax1_0_1 < WARP_N / INTRIN_N; ++ax1_0_1)
{
for (int local_id = 0; local_id < OP_M * 16 / WARP_SIZE; local_id += 2)
{
int write_row = cta_offset_m + warp_offset_m + ax0_0_1 * OP_M + ((local_id % 4) / 2 * 8 + (threadIdx.x / 4));
if (write_row < M)
{
*reinterpret_cast<f162_t *>(
C + write_row * N +
cta_offset_n + warp_offset_n + ax1_0_1 * 16 +
(local_id / 4) * 8 + (local_id % 2) + (threadIdx.x % 4) * 2) =
cuda_cast<f162_t>(*reinterpret_cast<float2 *>(C_warp + ax0_0_1 * WARP_N / INTRIN_N * 8 +
ax1_0_1 * 8 + local_id));
}
};
}
}
}
if constexpr (SPLITK > 1)
{
int lock = 0;
if (SPLITK == blockIdx_z + 1)
{
lock = 0;
}
else
{
lock = blockIdx_z + 1;
}
semaphore.release(lock);
}
}
}
template <typename f16_t, int CTA_M, int CTA_N, int CTA_K, int CTA_SIZE, int SHARED_K_ITERS, int STAGES>
__device__ __inline__ void global_to_share_one_stage_A_T2(f16_t *src, f16_t *dst, int global_nrows, int global_ncols, int cta_offset_m, int cta_offset_n, int global_iter_k, int shared_iter_k, bool mask)
{
constexpr int threads_needed = (CTA_M * CTA_K) / PACK_SIZE / SHARED_K_ITERS;
constexpr int threads_used = threads_needed < CTA_SIZE ? threads_needed : CTA_SIZE;
constexpr int total_global_iters = (CTA_M * CTA_K) / PACK_SIZE / threads_used;
constexpr int partial_global_iters = (total_global_iters + SHARED_K_ITERS - 1) / SHARED_K_ITERS;
constexpr int cta_step_m_or_n = (threads_used * PACK_SIZE) / CTA_K;
constexpr int warp_step_m_or_n = (WARP_SIZE * PACK_SIZE) / CTA_K;
constexpr int threads_per_row = CTA_K / PACK_SIZE;
constexpr int kSmemCol = CTA_K + SMEM_PAD_A;
bool local_mask = mask & (threadIdx.y * WARP_SIZE + threadIdx.x < threads_used);
int ld_col = (threadIdx.x % threads_per_row);
#pragma unroll
for (int _global_iter = 0; _global_iter < partial_global_iters; ++_global_iter)
{
int global_iter = shared_iter_k * partial_global_iters + _global_iter;
int ld_row = global_iter * cta_step_m_or_n + threadIdx.y * warp_step_m_or_n + (threadIdx.x / threads_per_row);
int ld_col_swizzled = (ld_col ^ (ld_row) & 7) * PACK_SIZE;
void *dst_ptr = (void *)(dst + ld_row * kSmemCol + ld_col_swizzled);
uint4 *src_ptr = (uint4 *)(src + (ld_row + cta_offset_m) * global_ncols + ld_col * PACK_SIZE + global_iter_k * CTA_K); // cta_offset_m * global_ncols + global_iter * cta_step_m_or_n * global_ncols + threadIdx.y * warp_step_m_or_n * global_ncols + (threadIdx.x / threads_per_row) * global_ncols + global_iter_k * CTA_K + (threadIdx.x % threads_per_row) * PACK_SIZE);
if constexpr (STAGES > 1)
{
uint32_t addr = cast_smem_ptr_to_uint(dst_ptr);
cp_async_cg_A(addr, src_ptr, local_mask & (ld_row + cta_offset_m < global_nrows));
}
else
{
if (local_mask & (ld_row + cta_offset_m < global_nrows))
*(uint4 *)dst_ptr = *src_ptr;
}
}
}
template <typename f16_t, int CTA_M, int CTA_N, int CTA_K, int CTA_SIZE, int SHARED_K_ITERS, int STAGES>
__device__ __inline__ void global_to_share_one_stage_B_T2(f16_t *src, f16_t *dst, int global_ncols, int cta_offset_m, int cta_offset_n, int global_iter_k, int shared_iter_k, bool mask)
{
constexpr int threads_needed = (CTA_N / kInterleave * CTA_K) / PACK_SIZE / SHARED_K_ITERS;
constexpr int threads_used = threads_needed < CTA_SIZE ? threads_needed : CTA_SIZE;
constexpr int total_global_iters = (CTA_N / kInterleave * CTA_K) / PACK_SIZE / threads_used;
constexpr int partial_global_iters = (total_global_iters + SHARED_K_ITERS - 1) / SHARED_K_ITERS;
constexpr int cta_step_m_or_n = (threads_used * PACK_SIZE) / CTA_K;
constexpr int warp_step_m_or_n = (WARP_SIZE * PACK_SIZE) / CTA_K;
constexpr int threads_per_row = CTA_K / PACK_SIZE;
constexpr int kSmemCol = CTA_K + SMEM_PAD_B;
bool local_mask = mask & (threadIdx.y * WARP_SIZE + threadIdx.x < threads_used);
#pragma unroll
for (int _global_iter = 0; _global_iter < partial_global_iters; ++_global_iter)
{
int global_iter = shared_iter_k * partial_global_iters + _global_iter;
int ld_row = global_iter * cta_step_m_or_n + threadIdx.y * warp_step_m_or_n + (threadIdx.x / threads_per_row);
int ld_col = (threadIdx.x % threads_per_row);
int ld_col_swizzled = ld_col ^ (ld_row % 2) & 7;
void *dst_ptr = (void *)(dst + (ld_row * kSmemCol + ld_col_swizzled * PACK_SIZE));
uint4 *src_ptr = (uint4 *)(src + global_iter_k * CTA_K + cta_offset_n / kInterleave * global_ncols + ld_row * global_ncols + ld_col * PACK_SIZE);
if constexpr (STAGES > 1)
{
uint32_t addr = cast_smem_ptr_to_uint(dst_ptr);
cp_async_cg_A(addr, src_ptr, local_mask);
}
else
{
if (local_mask)
*(uint4 *)dst_ptr = *src_ptr;
}
}
}
template <typename f16_t, int CTA_M, int CTA_N, int CTA_K, int CTA_SIZE, int STAGES, int G>
__device__ __inline__ void global_to_share_one_stage_scales_T2(f16_t *src, f16_t *dst, f16_t *src_z, f16_t *dst_z, int global_ncols, int cta_offset_m, int cta_offset_n, int global_iter_k, int shared_iter_k, bool mask)
{
constexpr int threads_needed = CTA_N / PACK_SIZE / 1;
constexpr int threads_used = threads_needed < CTA_SIZE ? threads_needed : CTA_SIZE;
constexpr int total_global_iters = CTA_N / PACK_SIZE / threads_used;
constexpr int threads_per_row = CTA_N / PACK_SIZE;
constexpr int kSmemCol = CTA_N;
bool local_mask = mask & (threadIdx.y * WARP_SIZE + threadIdx.x < threads_used);
int g_idx = global_iter_k * CTA_K / G;
void *dst_ptr = (void *)(dst + (threadIdx.x % threads_per_row) * PACK_SIZE);
uint4 *src_ptr = (uint4 *)(src + g_idx * global_ncols + cta_offset_n + (threadIdx.x % threads_per_row) * PACK_SIZE);
void *dst_ptr_z = (void *)(dst_z + (threadIdx.x % threads_per_row) * PACK_SIZE);
uint4 *src_ptr_z = (uint4 *)(src_z + g_idx * global_ncols + cta_offset_n + (threadIdx.x % threads_per_row) * PACK_SIZE);
if (STAGES > 1)
{
uint32_t addr = cast_smem_ptr_to_uint(dst_ptr);
cp_async_cg_A(addr, src_ptr, local_mask);
uint32_t addr_z = cast_smem_ptr_to_uint(dst_ptr_z);
cp_async_cg_A(addr_z, src_ptr_z, local_mask);
}
else
{
if (local_mask)
{
*(uint4 *)dst_ptr = *src_ptr;
*(uint4 *)dst_ptr_z = *src_ptr_z;
}
}
}
template <typename f16_t, int CTA_M, int CTA_N, int CTA_K, int STAGES, int shared_iters>
__device__ __inline__ void share_to_reg_one_stage_A_T2(f16_t *src, f16_t *dst, int warp_offset_m, int warp_offset_n, int k_0_1)
{
constexpr int kSmemCol = CTA_K + SMEM_PAD_A;
for (int shared_iter = 0; shared_iter < shared_iters; ++shared_iter)
{
int ld_row = warp_offset_m + shared_iter * OP_M + (threadIdx.x % 16);
int ld_col = k_0_1 * 16 + (threadIdx.x / 16) * 8;
int ld_col_swizzled = ((ld_col / PACK_SIZE) ^ (ld_row) & 7) * PACK_SIZE;
void *addr_ptr = (void *)(src + ld_row * kSmemCol + ld_col_swizzled);
uint32_t addr = cast_smem_ptr_to_uint(addr_ptr);
ldmatrix_m8n8_x4_b16(dst, shared_iter, addr);
}
}
template <typename f16_t, int CTA_M, int CTA_N, int CTA_K, int STAGES, bool ldmatrix, int shared_iters, int G>
__device__ __inline__ void share_to_reg_one_stage_B_T2(f16_t *src, f16_t *src_scales, f16_t *src_zeros, f16_t *dst, f16_t *dst_fp16, int warp_offset_m, int warp_offset_n, int k_0_1)
{
using f162_t = typename packed_as<f16_t, 2>::type;
constexpr int kSmemCol = CTA_K + SMEM_PAD_B;
int r0 = ((threadIdx.x / 8 / 2) * 8 + threadIdx.x % 8);
int c0 = ((threadIdx.x / 8) % 2) * 8;
int r = r0 / 4;
int c = (r0 % 4) * 16 + c0;
int c_swizzled = ((c / PACK_SIZE) ^ (r % 2) & 7) * PACK_SIZE;
if constexpr (ldmatrix)
{
#pragma unroll
for (int shared_iter = 0; shared_iter < shared_iters; ++shared_iter)
{
void *addr_ptr = (void *)(src + warp_offset_n / kInterleave * kSmemCol + shared_iter * 16 / kInterleave * kSmemCol + k_0_1 * 16 + r * kSmemCol + c_swizzled);
uint32_t addr = cast_smem_ptr_to_uint(addr_ptr);
ldmatrix_m8n8_x4_b16(dst, shared_iter, addr);
}
}
#pragma unroll
for (int shared_iter = 0; shared_iter < shared_iters; ++shared_iter)
{
f16_t scale = src_scales[warp_offset_n + 16 * shared_iter + 8 * (k_0_1 % 2) + threadIdx.x / 4];
f16_t zero = src_zeros[warp_offset_n + 16 * shared_iter + 8 * (k_0_1 % 2) + threadIdx.x / 4];
f162_t scale2 = f162f162(scale);
f162_t zero2 = f162f162(zero);
f162_t loaded[4];
dequantize_s4_to_fp16x2(*reinterpret_cast<f162_t *>(dst + (k_0_1 % 2) * 4 + (k_0_1 / 2 * 2) + shared_iter * 8), reinterpret_cast<uint4 *>(loaded));
#pragma unroll
for (int i = 0; i < 4; i++)
{
loaded[i] = __hfma2(loaded[i], scale2, zero2);
}
*reinterpret_cast<uint4 *>(dst_fp16 + shared_iter * 16 + 8 * (k_0_1 % 2)) = *reinterpret_cast<uint4 *>(loaded);
}
}
template <typename f16_t, int CTA_M, int CTA_N, int CTA_K, int WARP_M, int WARP_N, int WARP_K, int STAGES, int G>
__global__ void gemm_w4a16_T2(f16_t *__restrict__ A, f16_t *__restrict__ B, f16_t *__restrict__ scales, f16_t *__restrict__ zeros, f16_t *__restrict__ C, int M, int N, int K)
{
using f162_t = typename packed_as<f16_t, 2>::type;
constexpr int NUM_WARPS = CTA_M / WARP_M * CTA_N / WARP_N;
constexpr int CTA_SIZE = NUM_WARPS * WARP_SIZE;
int num_blocks_n = (N + CTA_N - 1) / CTA_N;
int num_blocks_m = (M + CTA_M - 1) / CTA_M;
int blockIdx_x = 0;
int blockIdx_y = blockIdx.x % (num_blocks_m * num_blocks_n);
int blockIdx_z = blockIdx.x / (num_blocks_m * num_blocks_n);
const int log_tile = get_log_tile<1>((N + CTA_N - 1) / CTA_N);
int blockIdx_m = blockIdx_y / (num_blocks_n >> log_tile);
int blockIdx_n = blockIdx_y % (num_blocks_n >> log_tile);
const uint2 block_idx_mapping = get_block_idx_mapping(blockIdx_m, blockIdx_n, log_tile);
blockIdx_m = block_idx_mapping.x;
blockIdx_n = block_idx_mapping.y;
float C_warp[CTA_M * CTA_N / CTA_SIZE];
constexpr int kSmemPadKA = CTA_K + SMEM_PAD_A;
constexpr int kSmemPadKB = CTA_K + SMEM_PAD_B;
constexpr int kSmemSizeAPerStage = CTA_M * kSmemPadKA;
constexpr int kSmemSizeBPerStage = CTA_N / kInterleave * kSmemPadKB;
constexpr int kSmemSizeA = kSmemSizeAPerStage * STAGES;
constexpr int kSmemSizeB = kSmemSizeBPerStage * STAGES;
constexpr int kSmemSizeScales = CTA_N * STAGES / 2;
constexpr int kSmemSizeZeros = CTA_N * STAGES / 2;
constexpr int scales_load_interval = G / CTA_K;
extern __shared__ half mem_shared[];
f16_t *A_shared = reinterpret_cast<f16_t *>(mem_shared);
f16_t *B_shared = reinterpret_cast<f16_t *>(mem_shared + kSmemSizeA);
f16_t *scales_shared = reinterpret_cast<f16_t *>(mem_shared + kSmemSizeA + kSmemSizeB);
f16_t *zeros_shared = reinterpret_cast<f16_t *>(mem_shared + kSmemSizeA + kSmemSizeB + kSmemSizeScales);
f16_t A_shared_warp_[2][WARP_M * INTRIN_K / WARP_SIZE];
f16_t B_shared_warp_[2][WARP_N * 32 / WARP_SIZE];
f16_t B_shared_warp_tmp_[2][WARP_N * 16 / WARP_SIZE];
int cta_offset_m = blockIdx_m * CTA_M;
int cta_offset_n = blockIdx_n * CTA_N;
int warp_offset_m = (threadIdx.y % (CTA_M / WARP_M)) * WARP_M;
int warp_offset_n = (threadIdx.y / (CTA_M / WARP_M)) * WARP_N;
for (int i = 0; i < CTA_M * CTA_N / CTA_SIZE; i++)
C_warp[i] = 0.0;
int gemm_iters = (K + CTA_K - 1) / CTA_K;
int k_0_0_ld = 0;
int k_0_0 = 0;
constexpr int prologue_stages = STAGES == 1 ? 1 : STAGES - 1;
#pragma unroll
for (k_0_0_ld = 0; k_0_0_ld < prologue_stages; ++k_0_0_ld)
{
global_to_share_one_stage_A_T2<f16_t, CTA_M, CTA_N, CTA_K, CTA_SIZE, 1, STAGES>(A, A_shared + k_0_0_ld * kSmemSizeAPerStage, M, K, cta_offset_m, cta_offset_n, k_0_0_ld, 0, true);
global_to_share_one_stage_B_T2<f16_t, CTA_M, CTA_N, CTA_K, CTA_SIZE, 1, STAGES>(B, B_shared + k_0_0_ld * kSmemSizeBPerStage, K, cta_offset_m, cta_offset_n, k_0_0_ld, 0, true);
global_to_share_one_stage_scales_T2<f16_t, CTA_M, CTA_N, CTA_K, CTA_SIZE, STAGES, G>(
scales, scales_shared + (k_0_0_ld / scales_load_interval) * CTA_N,
zeros, zeros_shared + (k_0_0_ld / scales_load_interval) * CTA_N,
N, cta_offset_m, cta_offset_n, k_0_0_ld, 0, k_0_0_ld < gemm_iters && k_0_0_ld % scales_load_interval == 0);
if constexpr (STAGES > 1)
__pipeline_commit();
}
if constexpr (STAGES > 1)
__pipeline_wait_prior(STAGES - 2);
__syncthreads();
share_to_reg_one_stage_A_T2<f16_t, CTA_M, CTA_N, CTA_K, STAGES, WARP_M / INTRIN_M>(A_shared, A_shared_warp_[0], warp_offset_m, warp_offset_n, 0);
share_to_reg_one_stage_B_T2<f16_t, CTA_M, CTA_N, CTA_K, STAGES, true, WARP_N / INTRIN_N, G>(B_shared, scales_shared, zeros_shared, B_shared_warp_tmp_[0], B_shared_warp_[0], warp_offset_m, warp_offset_n, 0);
constexpr int SHARED_K_ITERS = WARP_K / INTRIN_K;
for (; k_0_0 < gemm_iters; ++k_0_0, ++k_0_0_ld)
{
int ld_stage = k_0_0_ld % STAGES;
int compute_stage = k_0_0 % STAGES;
f16_t *A_shared_this_compute_stage;
f16_t *B_shared_this_compute_stage;
f16_t *scales_shared_this_compute_stage;
f16_t *zeros_shared_this_compute_stage;
for (int iter_k = 0; iter_k < SHARED_K_ITERS; ++iter_k)
{
A_shared_this_compute_stage = A_shared + compute_stage * kSmemSizeAPerStage;
B_shared_this_compute_stage = B_shared + compute_stage * kSmemSizeBPerStage;
scales_shared_this_compute_stage = scales_shared + (compute_stage / scales_load_interval) * CTA_N;
zeros_shared_this_compute_stage = zeros_shared + (compute_stage / scales_load_interval) * CTA_N;
share_to_reg_one_stage_A_T2<f16_t, CTA_M, CTA_N, CTA_K, STAGES, WARP_M / INTRIN_M>(A_shared_this_compute_stage, A_shared_warp_[(iter_k + 1) % 2], warp_offset_m, warp_offset_n, (iter_k + 1) % SHARED_K_ITERS);
if ((iter_k + 1) % kInterleave == 0)
{
if (compute_stage % 2 == 1)
{
share_to_reg_one_stage_B_T2<f16_t, CTA_M, CTA_N, CTA_K, STAGES, true, WARP_N / INTRIN_N, G>(
B_shared_this_compute_stage, scales_shared_this_compute_stage, zeros_shared_this_compute_stage,
B_shared_warp_tmp_[1], B_shared_warp_[((iter_k + 1) / 2) % 2],
warp_offset_m, warp_offset_n, (iter_k + 1) % SHARED_K_ITERS);
}
else
{
share_to_reg_one_stage_B_T2<f16_t, CTA_M, CTA_N, CTA_K, STAGES, true, WARP_N / INTRIN_N, G>(
B_shared_this_compute_stage, scales_shared_this_compute_stage, zeros_shared_this_compute_stage,
B_shared_warp_tmp_[0], B_shared_warp_[((iter_k + 1) / 2) % 2],
warp_offset_m, warp_offset_n, (iter_k + 1) % SHARED_K_ITERS);
}
}
else
{
if (compute_stage % 2 == 1)
{
share_to_reg_one_stage_B_T2<f16_t, CTA_M, CTA_N, CTA_K, STAGES, false, WARP_N / INTRIN_N, G>(
B_shared_this_compute_stage, scales_shared_this_compute_stage, zeros_shared_this_compute_stage,
B_shared_warp_tmp_[1], B_shared_warp_[((iter_k + 1) / 2) % 2],
warp_offset_m, warp_offset_n, (iter_k + 1) % SHARED_K_ITERS);
}
else
{
share_to_reg_one_stage_B_T2<f16_t, CTA_M, CTA_N, CTA_K, STAGES, false, WARP_N / INTRIN_N, G>(
B_shared_this_compute_stage, scales_shared_this_compute_stage, zeros_shared_this_compute_stage,
B_shared_warp_tmp_[0], B_shared_warp_[((iter_k + 1) / 2) % 2],
warp_offset_m, warp_offset_n, (iter_k + 1) % SHARED_K_ITERS);
}
}
__syncthreads();
f16_t *A_shared_warp = A_shared_warp_[iter_k % 2];
f16_t *B_shared_warp = B_shared_warp_[(iter_k / 2) % 2];
for (int i_0_3 = 0; i_0_3 < WARP_M / INTRIN_M; ++i_0_3)
{
for (int j_0_4 = 0; j_0_4 < WARP_N / INTRIN_N; ++j_0_4)
{
mma_m16n8k16(C_warp + i_0_3 * WARP_N / INTRIN_N * 8 + j_0_4 * 8, A_shared_warp + i_0_3 * 8, B_shared_warp + j_0_4 * 16 + (iter_k % 2) * 4);
mma_m16n8k16(C_warp + i_0_3 * WARP_N / INTRIN_N * 8 + j_0_4 * 8 + 4, A_shared_warp + i_0_3 * 8, B_shared_warp + j_0_4 * 16 + (iter_k % 2) * 4 + 8);
}
}
if (iter_k < WARP_K / INTRIN_K - 1)
{
if constexpr (STAGES == 1)
__syncthreads();
global_to_share_one_stage_A_T2<f16_t, CTA_M, CTA_N, CTA_K, CTA_SIZE, WARP_K / INTRIN_K, STAGES>(A, A_shared + ld_stage * kSmemSizeAPerStage, M, K, cta_offset_m, cta_offset_n, k_0_0_ld, iter_k, k_0_0_ld < gemm_iters);
global_to_share_one_stage_B_T2<f16_t, CTA_M, CTA_N, CTA_K, CTA_SIZE, WARP_K / INTRIN_K, STAGES>(B, B_shared + ld_stage * kSmemSizeBPerStage, K, cta_offset_m, cta_offset_n, k_0_0_ld, iter_k, k_0_0_ld < gemm_iters);
}
if (iter_k == WARP_K / INTRIN_K - 2)
{
if constexpr (STAGES == 1 && WARP_K / INTRIN_K > 2)
{
__syncthreads();
}
global_to_share_one_stage_A_T2<f16_t, CTA_M, CTA_N, CTA_K, CTA_SIZE, WARP_K / INTRIN_K, STAGES>(A, A_shared + ld_stage * kSmemSizeAPerStage, M, K, cta_offset_m, cta_offset_n, k_0_0_ld, iter_k + 1, k_0_0_ld < gemm_iters);
global_to_share_one_stage_B_T2<f16_t, CTA_M, CTA_N, CTA_K, CTA_SIZE, WARP_K / INTRIN_K, STAGES>(B, B_shared + ld_stage * kSmemSizeBPerStage, K, cta_offset_m, cta_offset_n, k_0_0_ld, iter_k + 1, k_0_0_ld < gemm_iters);
global_to_share_one_stage_scales_T2<f16_t, CTA_M, CTA_N, CTA_K, CTA_SIZE, STAGES, G>(
scales, scales_shared + (ld_stage / scales_load_interval) * CTA_N,
zeros, zeros_shared + (ld_stage / scales_load_interval) * CTA_N,
N, cta_offset_m, cta_offset_n, k_0_0_ld, iter_k, k_0_0_ld < gemm_iters && k_0_0_ld % scales_load_interval == 0);
if constexpr (STAGES > 1)
{
__pipeline_commit();
__pipeline_wait_prior(STAGES - 2);
}
compute_stage = (k_0_0 + 1) % STAGES;
__syncthreads();
}
}
}
for (int ax0_0_1 = 0; ax0_0_1 < WARP_M / INTRIN_M; ++ax0_0_1)
{
for (int ax1_0_1 = 0; ax1_0_1 < WARP_N / INTRIN_N; ++ax1_0_1)
{
for (int local_id = 0; local_id < OP_M * 16 / WARP_SIZE; local_id += 2)
{
int write_row = cta_offset_m + warp_offset_m + ax0_0_1 * OP_M + ((local_id % 4) / 2 * 8 + (threadIdx.x / 4));
if (write_row < M)
{
*reinterpret_cast<f162_t *>(
C + write_row * N +
cta_offset_n + warp_offset_n + ax1_0_1 * 16 +
(local_id / 4) * 8 + (local_id % 2) + (threadIdx.x % 4) * 2) =
cuda_cast<f162_t>(*reinterpret_cast<float2 *>(C_warp + ax0_0_1 * WARP_N / INTRIN_N * 8 +
ax1_0_1 * 8 + local_id));
}
};
}
}
}
Tensor awq_gemm_forward_cuda(
Tensor _in_feats,
Tensor _kernel,
Tensor _scales,
Tensor _zeros)
{
auto output_shape = _in_feats.shape.dataExtent;
output_shape.back() = _kernel.size(0) * kInterleave;
int num_in_feats = _in_feats.numel() / _in_feats.size(-1);
int num_in_channels = _in_feats.size(-1);
auto options =
Tensor::TensorOptions().dtype(_in_feats.dtype()).device(_in_feats.device());
auto options_int =
Tensor::TensorOptions().dtype(Tensor::INT32).device(_in_feats.device());
Tensor _out_feats = Tensor::allocate(output_shape, _in_feats.dtype(), _in_feats.device());
int num_out_feats = _out_feats.numel() / _out_feats.size(-1);
int num_out_channels = _out_feats.size(-1);
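// Kernel selection, summarized: small output batches (num_out_feats <= 192) use the
// split-K-capable gemm_w4a16_T1 (only the <= 32 case actually splits K, with SPLITK = 2),
// with CTA_M growing from 16 to 64 as the batch grows; larger batches use gemm_w4a16_T2
// without split-K. Both paths assume group size G = 128 and bail out if the computed
// shared-memory footprint reaches 99 KiB.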
if (_in_feats.scalar_type() == Tensor::FP16)
{
using f16_t = half;
auto in_feats = reinterpret_cast<f16_t *>(_in_feats.data_ptr());
auto kernel = reinterpret_cast<f16_t *>(_kernel.data_ptr<int16_t>());
auto scales = reinterpret_cast<f16_t *>(_scales.data_ptr());
auto zeros = reinterpret_cast<f16_t *>(_zeros.data_ptr());
auto out_feats = reinterpret_cast<f16_t *>(_out_feats.data_ptr());
if (num_out_feats <= 32)
{
constexpr int G = 128;
constexpr int CTA_M = 16;
constexpr int CTA_N = 128;
constexpr int CTA_K = 128;
constexpr int WARP_M = 16;
constexpr int WARP_N = 32;
constexpr int WARP_K = 64;
constexpr int SPLITK = 2;
constexpr int STAGES = 4;
KERNEL_LAUNCH_CODE
}
else if (num_out_feats <= 64)
{
constexpr int G = 128;
constexpr int CTA_M = 16;
constexpr int CTA_N = 128;
constexpr int CTA_K = 128;
constexpr int WARP_M = 16;
constexpr int WARP_N = 32;
constexpr int WARP_K = 64;
constexpr int SPLITK = 1;
constexpr int STAGES = 3;
KERNEL_LAUNCH_CODE
}
else if (num_out_feats <= 128)
{
constexpr int G = 128;
constexpr int CTA_M = 32;
constexpr int CTA_N = 128;
constexpr int CTA_K = 128;
constexpr int WARP_M = 32;
constexpr int WARP_N = 32;
constexpr int WARP_K = 64;
constexpr int SPLITK = 1;
constexpr int STAGES = 4;
KERNEL_LAUNCH_CODE
}
else if (num_out_feats <= 192)
{
constexpr int G = 128;
constexpr int CTA_M = 64;
constexpr int CTA_N = 128;
constexpr int CTA_K = 64;
constexpr int WARP_M = 64;
constexpr int WARP_N = 32;
constexpr int WARP_K = 64;
constexpr int SPLITK = 1;
constexpr int STAGES = 4;
KERNEL_LAUNCH_CODE
}
else
{
constexpr int G = 128;
constexpr int CTA_M = 64;
constexpr int CTA_N = 128;
constexpr int CTA_K = 64;
constexpr int WARP_M = 64;
constexpr int WARP_N = 32;
constexpr int WARP_K = 64;
constexpr int STAGES = 4;
constexpr int NUM_WARPS = (CTA_M / WARP_M) * (CTA_N / WARP_N);
constexpr int kSmemByteSize = (CTA_M * (CTA_K + SMEM_PAD_A) + CTA_N * (CTA_K + SMEM_PAD_B) / kInterleave + CTA_N) * STAGES * sizeof(f16_t);
if (kSmemByteSize >= 99 * 1024)
{
printf("This kernel requires %d Bytes of shared memory, which exceeds device limit.\n", kSmemByteSize);
return _out_feats;
}
int j_factors1 = num_out_channels / CTA_N / 1;
dim3 num_blocks((num_out_feats + CTA_M - 1) / CTA_M * j_factors1);
dim3 threads_per_block(WARP_SIZE, NUM_WARPS);
auto kernel_func = gemm_w4a16_T2<f16_t, CTA_M, CTA_N, CTA_K, WARP_M, WARP_N, WARP_K, STAGES, G>;
cudaFuncSetAttribute(kernel_func, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemByteSize);
kernel_func<<<num_blocks, threads_per_block, kSmemByteSize>>>(
in_feats, kernel, scales, zeros, out_feats, num_in_feats, num_out_channels, num_in_channels);
}
}
else if (_in_feats.scalar_type() == Tensor::BF16)
{
using f16_t = __nv_bfloat16;
auto in_feats = reinterpret_cast<f16_t *>(_in_feats.data_ptr());
auto kernel = reinterpret_cast<f16_t *>(_kernel.data_ptr<int16_t>());
auto scales = reinterpret_cast<f16_t *>(_scales.data_ptr());
auto zeros = reinterpret_cast<f16_t *>(_zeros.data_ptr());
auto out_feats = reinterpret_cast<f16_t *>(_out_feats.data_ptr());
if (num_out_feats <= 32)
{
constexpr int G = 128;
constexpr int CTA_M = 16;
constexpr int CTA_N = 128;
constexpr int CTA_K = 128;
constexpr int WARP_M = 16;
constexpr int WARP_N = 32;
constexpr int WARP_K = 64;
constexpr int SPLITK = 2;
constexpr int STAGES = 4;
KERNEL_LAUNCH_CODE
}
else if (num_out_feats <= 64)
{
constexpr int G = 128;
constexpr int CTA_M = 16;
constexpr int CTA_N = 128;
constexpr int CTA_K = 128;
constexpr int WARP_M = 16;
constexpr int WARP_N = 32;
constexpr int WARP_K = 64;
constexpr int SPLITK = 1;
constexpr int STAGES = 3;
KERNEL_LAUNCH_CODE
}
else if (num_out_feats <= 128)
{
constexpr int G = 128;
constexpr int CTA_M = 32;
constexpr int CTA_N = 128;
constexpr int CTA_K = 128;
constexpr int WARP_M = 32;
constexpr int WARP_N = 32;
constexpr int WARP_K = 64;
constexpr int SPLITK = 1;
constexpr int STAGES = 4;
KERNEL_LAUNCH_CODE
}
else if (num_out_feats <= 192)
{
constexpr int G = 128;
constexpr int CTA_M = 64;
constexpr int CTA_N = 128;
constexpr int CTA_K = 64;
constexpr int WARP_M = 64;
constexpr int WARP_N = 32;
constexpr int WARP_K = 64;
constexpr int SPLITK = 1;
constexpr int STAGES = 4;
KERNEL_LAUNCH_CODE
}
else
{
constexpr int G = 128;
constexpr int CTA_M = 64;
constexpr int CTA_N = 128;
constexpr int CTA_K = 64;
constexpr int WARP_M = 64;
constexpr int WARP_N = 32;
constexpr int WARP_K = 64;
constexpr int STAGES = 4;
constexpr int NUM_WARPS = (CTA_M / WARP_M) * (CTA_N / WARP_N);
constexpr int kSmemByteSize = (CTA_M * (CTA_K + SMEM_PAD_A) + CTA_N * (CTA_K + SMEM_PAD_B) / kInterleave + CTA_N) * STAGES * sizeof(f16_t);
if (kSmemByteSize >= 99 * 1024)
{
printf("This kernel requires %d Bytes of shared memory, which exceeds device limit.\n", kSmemByteSize);
return _out_feats;
}
int j_factors1 = num_out_channels / CTA_N / 1;
dim3 num_blocks((num_out_feats + CTA_M - 1) / CTA_M * j_factors1);
dim3 threads_per_block(WARP_SIZE, NUM_WARPS);
auto kernel_func = gemm_w4a16_T2<f16_t, CTA_M, CTA_N, CTA_K, WARP_M, WARP_N, WARP_K, STAGES, G>;
cudaFuncSetAttribute(kernel_func, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemByteSize);
kernel_func<<<num_blocks, threads_per_block, kSmemByteSize>>>(
in_feats, kernel, scales, zeros, out_feats, num_in_feats, num_out_channels, num_in_channels);
}
}
else
{
throw std::runtime_error("Unsupported input type");
}
return _out_feats;
}
\ No newline at end of file
#pragma once
#include "common.h"
#include "Tensor.h"
Tensor awq_gemm_forward_cuda(
Tensor _in_feats,
Tensor _kernel,
Tensor _scales,
Tensor _zeros);
......@@ -307,7 +307,7 @@ Tensor gemv_awq(
return;
}
if constexpr (M > 0) {
gemv_kernel<half_t, N_PER_BLOCK, M, BLOCK_SIZE, GROUP_SIZE><<<num_blocks, num_threads>>>(
gemv_kernel<half_t, N_PER_BLOCK, M, BLOCK_SIZE, GROUP_SIZE><<<num_blocks, num_threads, 0, getCurrentCUDAStream()>>>(
in_feats, kernel, scaling_factors, zeros, out_feats, k, n
);
checkCUDA(cudaGetLastError());
......
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Implementation of a CTA-wide semaphore for inter-CTA synchronization.
*/
#pragma once
/////////////////////////////////////////////////////////////////////////////////////////////////
// namespace cutlass {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// CTA-wide semaphore for inter-CTA synchronization.
class Semaphore
{
public:
int *lock;
bool wait_thread;
int state;
public:
/// Implements a semaphore to wait for a flag to reach a given value
__host__ __device__ Semaphore(int *lock_, int thread_id) : lock(lock_),
wait_thread(thread_id < 0 || thread_id == 0),
state(-1)
{
}
/// Permit fetching the synchronization mechanism early
__device__ void fetch()
{
if (wait_thread)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n" : "=r"(state) : "l"(lock));
#else
asm volatile("ld.global.cg.b32 %0, [%1];\n" : "=r"(state) : "l"(lock));
#endif
}
}
/// Gets the internal state
__device__ int get_state() const
{
return state;
}
/// Waits until the semaphore is equal to the given value
__device__ void wait(int status = 0)
{
while (__syncthreads_and(state != status))
{
fetch();
}
__syncthreads();
}
/// Updates the lock with the given result
__device__ void release(int status = 0)
{
__syncthreads();
if (wait_thread)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
asm volatile("st.global.release.gpu.b32 [%0], %1;\n" : : "l"(lock), "r"(status));
#else
asm volatile("st.global.cg.b32 [%0], %1;\n" : : "l"(lock), "r"(status));
#endif
}
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
// } // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////
......@@ -349,6 +349,29 @@ __device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, half2>(half2 val)
#endif // ENABLE_BF16
template <typename f16_t>
__device__ __forceinline__
typename packed_as<f16_t, 2>::type
f162f162(f16_t x);
template <>
__device__ __forceinline__
packed_as<half, 2>::type
f162f162<half>(half x)
{
return __half2half2(x);
}
#ifdef ENABLE_BF16
template <>
__device__ __forceinline__
packed_as<__nv_bfloat16, 2>::type
f162f162<__nv_bfloat16>(__nv_bfloat16 x)
{
return __bfloat162bfloat162(x);
}
#endif // ENABLE_BF16
template <typename To, typename Ti>
__device__ inline To cuda_sum(Ti val)
{
......
......@@ -1440,10 +1440,10 @@ public:
static constexpr int ROTARY_EMB_NUM_ELEMENTS = 2; // 1 for theta, 2 for {sin, cos} pair
__device__ __forceinline__
static void apply(fpsum_warp fpsum, half_t *out, int M, int N, int K, half_t *pool_out, const float *rotary_emb, const half_t *rmsnorm_weight, float epsilon) {
static void apply(fpsum_warp fpsum, half_t *out, int M, int N, int K, half_t *pool_out, const float *rotary_emb, const half_t *rmsnorm_weight, float epsilon, int maxRows) {
const int laneId = threadIdx.x % WARP_SIZE;
const int warpId = threadIdx.x / WARP_SIZE;
__shared__ alignas(128) uint8_t shmem[NUM_WARPS][ceilDiv(unpack_fpsum::SHMEM_SIZE, 128) * 128];
constexpr int PACK_SIZE = unpack_fpsum::PACK_SIZE;
......@@ -1470,9 +1470,9 @@ public:
CHECK_NAN(fpsum, "fpsum");
unpack_fpsum()(fpsum, out + warpId * WARP_M * N, N, INT_MAX, INT_MAX, shmem[warpId], [&](int rowId, pack_t &pack) ALWAYSINLINE {
unpack_fpsum()(fpsum, out + warpId * WARP_M * N, N, maxRows - warpId * WARP_M, INT_MAX, shmem[warpId], [&](int rowId, pack_t &pack) ALWAYSINLINE {
// load rope
pack_rope_t rope;
pack_rope_t rope;
if (laneId < LANES_PER_HEAD) {
// freq = load(reinterpret_cast<pack_freq_t *>(&freqs_cis[(warpId * WARP_M + rowId) * HEAD_DIM * 2 + laneId * PACK_SIZE * 2]));
rope = load(reinterpret_cast<const pack_rope_t *>(&rotary_emb_base_addr[rowId * HEAD_DIM / 2 * ROTARY_EMB_NUM_ELEMENTS]));
......@@ -1508,7 +1508,7 @@ public:
// rope
for (int i = 0; i < PACK_SIZE; i += 2) {
float2 pack2 = half22float2(half2_t(pack[i], pack[i+1]));
CHECK_NAN(freq[i].x, "rope.freq");
CHECK_NAN(freq[i].y, "rope.freq");
CHECK_NAN(freq[i+1].x, "rope.freq");
......@@ -1519,7 +1519,7 @@ public:
// pack[i] = tmp.x;
// pack[i+1] = tmp.y;
// printf("block.x=%d block.y=%d warpId=%d rowId=%d (%d) freqs = %f %f %f %f\n",
// printf("block.x=%d block.y=%d warpId=%d rowId=%d (%d) freqs = %f %f %f %f\n",
// blockIdx.x, blockIdx.y, warpId, rowId,
// blockIdx.x * BLOCK_M + warpId * WARP_M + rowId,
// (float)freq[i].x, (float)freq[i].y, (float)freq[i+1].x, (float)freq[i+1].y
......@@ -1579,7 +1579,7 @@ public:
for (int j = 0; j < PACK_SIZE; j++) {
reduce_tmp[j] /= PoolSize;
}
store(reinterpret_cast<pack_t *>(pool_out + warpId * N), reduce_tmp);
}
__syncthreads();
......@@ -1599,13 +1599,14 @@ public:
if (is_q || is_k) {
apply(
fpsum,
fpsum,
args.out + bm * BLOCK_M * args.actualN + bn * BLOCK_N,
M, N, K,
M, N, K,
args.pool_out ? args.pool_out + bm * BLOCK_M / PoolSize * N : nullptr,
args.rotary_emb + bm * BLOCK_M * (HEAD_DIM / 2 * ROTARY_EMB_NUM_ELEMENTS),
is_q ? args.rmsnorm_weight_q : args.rmsnorm_weight_k,
args.epsilon
args.epsilon,
args.actualM - bm * BLOCK_M
);
} else {
EpilogueDefault()(binfo, fpsum, M, N, K, typename EpilogueDefault::Arguments{
......
......@@ -5,8 +5,13 @@ namespace nunchaku::kernels {
template<typename Config>
class GEMM_W4A4_Launch {
using GEMM = GEMM_W4A4<Config>;
using LoraRanks = std::integer_sequence<int, 0, 32, 48, 64, 80, 96>;
// using LoraRanks = std::integer_sequence<int, 32>;
// using LoraRanks = std::integer_sequence<int, 0, 32>;
using LoraRanks = std::integer_sequence<int, 0, 32, 48, 64, 80, 96, 112, 128, 160, 176, 224>;
// using LoraRanks = std::integer_sequence<int,
// 0, 32, 48, 64, 80, 96, 112, 128, 144, 160,
// 176, 192, 208, 224, 240, 256, 272, 288, 304, 320,
// 336, 352, 368, 384, 400, 416, 432, 448, 464, 480,
// 496, 512>;
using packed_act_t = typename GEMM::packed_act_t;
using packed_wgt_t = typename GEMM::packed_wgt_t;
......
......@@ -97,7 +97,7 @@ void GEMM_W4A4_Launch<GEMMConfig_W4A4_FP16>::gemm_w4a4(
assert(alpha == 1.0f);
func<<<grid, GEMM::WARP_SIZE * GEMM::NUM_WARPS, shmem>>>(
func<<<grid, GEMM::WARP_SIZE * GEMM::NUM_WARPS, shmem, getCurrentCUDAStream()>>>(
act.data_ptr<packed_act_t>(),
wgt.data_ptr<packed_wgt_t>(),
ascales.data_ptr<packed_ascale_t>(),
......@@ -134,7 +134,7 @@ void GEMM_W4A4_Launch<GEMMConfig_W4A4_FP16>::gemm_w4a4(
assert(ascales.dtype() == Tensor::FP8_E4M3);
assert(wscales.dtype() == Tensor::FP8_E4M3);
func<<<grid, GEMM::WARP_SIZE * GEMM::NUM_WARPS, shmem>>>(
func<<<grid, GEMM::WARP_SIZE * GEMM::NUM_WARPS, shmem, getCurrentCUDAStream()>>>(
act.data_ptr<packed_act_t>(),
wgt.data_ptr<packed_wgt_t>(),
ascales.data_ptr<packed_amscale_t>(),
......@@ -375,7 +375,7 @@ void GEMM_W4A4_Launch<Config>::linearattn_vk_mul_q(Tensor q, Tensor vk) {
BLOCK_SIZE = 128;
}
invoke_kernel<typename Epilogue::vk_mul_q_kernel><<<dim3(ceilDiv(num_tokens, BLOCK_SIZE), num_heads, batch_size), BLOCK_SIZE>>>(
invoke_kernel<typename Epilogue::vk_mul_q_kernel><<<dim3(ceilDiv(num_tokens, BLOCK_SIZE), num_heads, batch_size), BLOCK_SIZE, 0, getCurrentCUDAStream()>>>(
q.data_ptr<half_t>(),
vk.data_ptr<float>(),
1e-6f,
......@@ -428,7 +428,7 @@ void GEMM_W4A4_Launch<Config>::quantize_w4a4_act_fuse_lora(Tensor input, Tensor
// log(std::format("quantize_w4a4_act_fuse_lora M={} N={} input={} output={} (size={} numel={})", M, N, input.data_ptr(), output.data_ptr(), output.buffer->getSize(), output.numel()));
func<<<grid, GEMM::WARP_SIZE * GEMM::NUM_WARPS, kernel::SHMEM_SIZE>>>(
func<<<grid, GEMM::WARP_SIZE * GEMM::NUM_WARPS, kernel::SHMEM_SIZE, getCurrentCUDAStream()>>>(
typename kernel::Arguments{
.input = input.data_ptr<half_t>(),
.smooth_factor = smooth.valid() ? smooth.data_ptr<packed_wscale_t>() : nullptr,
......@@ -462,7 +462,7 @@ void GEMM_W4A4_Launch<Config>::quantize_w4a4_act(Tensor input, Tensor output, Te
assert(oscales.numel() == M * K / GEMM::WARP_K);
dim3 grid(M / GEMM::WARP_M, K / GEMM::WARP_K);
invoke_kernel<typename GEMM::quantize_w4a4_act_kernel><<<grid, GEMM::WARP_SIZE>>>(
invoke_kernel<typename GEMM::quantize_w4a4_act_kernel><<<grid, GEMM::WARP_SIZE, 0, getCurrentCUDAStream()>>>(
input.data_ptr<half_t>(),
output.data_ptr<packed_act_t>(),
oscales.data_ptr<packed_ascale_t>(),
......@@ -486,7 +486,7 @@ void GEMM_W4A4_Launch<Config>::quantize_w4a4_wgt(Tensor input, Tensor output, Te
assert(oscales.numel() == N * K / GEMM::WARP_K);
dim3 grid(N / GEMM::WARP_N, K / GEMM::WARP_K);
invoke_kernel<typename GEMM::quantize_w4a4_wgt_kernel><<<grid, GEMM::WARP_SIZE>>>(
invoke_kernel<typename GEMM::quantize_w4a4_wgt_kernel><<<grid, GEMM::WARP_SIZE, 0, getCurrentCUDAStream()>>>(
input.data_ptr<half_t>(),
output.data_ptr<packed_wgt_t>(),
oscales.data_ptr<packed_wscale_t>(),
......
import json
import os
import random
import datasets
from PIL import Image
_CITATION = """\
@misc{li2024playground,
title={Playground v2.5: Three Insights towards Enhancing Aesthetic Quality in Text-to-Image Generation},
author={Daiqing Li and Aleks Kamko and Ehsan Akhgari and Ali Sabet and Linmiao Xu and Suhail Doshi},
year={2024},
eprint={2402.17245},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
"""
_DESCRIPTION = """\
We introduce a new benchmark, MJHQ-30K, for automatic evaluation of a model’s aesthetic quality.
The benchmark computes FID on a high-quality dataset to gauge aesthetic quality.
"""
_HOMEPAGE = "https://huggingface.co/datasets/playgroundai/MJHQ-30K"
_LICENSE = (
"Playground v2.5 Community License "
"(https://huggingface.co/playgroundai/playground-v2.5-1024px-aesthetic/blob/main/LICENSE.md)"
)
IMAGE_URL = "https://huggingface.co/datasets/playgroundai/MJHQ-30K/resolve/main/mjhq30k_imgs.zip"
META_URL = "https://huggingface.co/datasets/playgroundai/MJHQ-30K/resolve/main/meta_data.json"
class MJHQConfig(datasets.BuilderConfig):
def __init__(self, max_dataset_size: int = -1, return_gt: bool = False, **kwargs):
super(MJHQConfig, self).__init__(
name=kwargs.get("name", "default"),
version=kwargs.get("version", "0.0.0"),
data_dir=kwargs.get("data_dir", None),
data_files=kwargs.get("data_files", None),
description=kwargs.get("description", None),
)
self.max_dataset_size = max_dataset_size
self.return_gt = return_gt
class MJHQ(datasets.GeneratorBasedBuilder):
VERSION = datasets.Version("0.0.0")
BUILDER_CONFIG_CLASS = MJHQConfig
BUILDER_CONFIGS = [MJHQConfig(name="MJHQ", version=VERSION, description="MJHQ-30K full dataset")]
DEFAULT_CONFIG_NAME = "MJHQ"
def _info(self):
features = datasets.Features(
{
"filename": datasets.Value("string"),
"category": datasets.Value("string"),
"image": datasets.Image(),
"prompt": datasets.Value("string"),
"prompt_path": datasets.Value("string"),
"image_root": datasets.Value("string"),
"image_path": datasets.Value("string"),
"split": datasets.Value("string"),
}
)
return datasets.DatasetInfo(
description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
)
def _split_generators(self, dl_manager: datasets.download.DownloadManager):
meta_path = dl_manager.download(META_URL)
image_root = dl_manager.download_and_extract(IMAGE_URL)
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN, gen_kwargs={"meta_path": meta_path, "image_root": image_root}
),
]
def _generate_examples(self, meta_path: str, image_root: str):
with open(meta_path, "r") as f:
meta = json.load(f)
names = list(meta.keys())
if self.config.max_dataset_size > 0:
random.Random(0).shuffle(names)
names = names[: self.config.max_dataset_size]
names = sorted(names)
for i, name in enumerate(names):
category = meta[name]["category"]
prompt = meta[name]["prompt"]
image_path = os.path.join(image_root, category, f"{name}.jpg")
yield i, {
"filename": name,
"category": category,
"image": Image.open(image_path) if self.config.return_gt else None,
"prompt": prompt,
"meta_path": meta_path,
"image_root": image_root,
"image_path": image_path,
"split": self.config.name,
}
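For reference, a minimal sketch of loading this builder directly with `datasets.load_dataset`; the local directory name "MJHQ" and the subsample size are assumptions for illustration, and `get_dataset` below wraps the same call:

```python
import datasets

# Hypothetical usage: "MJHQ" is the local directory that contains this builder script.
ds = datasets.load_dataset(
    "MJHQ",
    split="train",
    trust_remote_code=True,  # required for custom builder scripts
    max_dataset_size=16,     # MJHQConfig kwarg: keep a deterministic 16-prompt subset
    return_gt=False,         # skip decoding the ground-truth images
)
print(ds[0]["filename"], ds[0]["prompt"])
```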
import os
import random
import datasets
import yaml
from nunchaku.utils import fetch_or_download
__all__ = ["get_dataset"]
def load_dataset_yaml(meta_path: str, max_dataset_size: int = -1, repeat: int = 4) -> dict:
    with open(meta_path, "r") as f:
        meta = yaml.safe_load(f)
names = list(meta.keys())
if max_dataset_size > 0:
random.Random(0).shuffle(names)
names = names[:max_dataset_size]
names = sorted(names)
ret = {"filename": [], "prompt": [], "meta_path": []}
for name in names:
prompt = meta[name]
for j in range(repeat):
ret["filename"].append(f"{name}-{j}")
ret["prompt"].append(prompt)
ret["meta_path"].append(meta_path)
return ret
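A minimal sketch of the flat `{name: prompt}` YAML layout that `load_dataset_yaml` expects; the temporary file and prompts below are made up for illustration:

```python
import tempfile

import yaml

# Hypothetical prompt file: load_dataset_yaml expects a flat {name: prompt} mapping.
prompts = {
    "robot": "A robot made of exotic candies and chocolates.",
    "cat": "A cat holding a sign that says hello world.",
}
with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as f:
    yaml.safe_dump(prompts, f)

columns = load_dataset_yaml(f.name, max_dataset_size=-1, repeat=2)
# Each prompt is repeated `repeat` times, producing filenames such as "cat-0" and "cat-1".
print(columns["filename"])
```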
def get_dataset(
name: str,
config_name: str | None = None,
split: str = "train",
return_gt: bool = False,
max_dataset_size: int = 5000,
) -> datasets.Dataset:
prefix = os.path.dirname(__file__)
kwargs = {
"name": config_name,
"split": split,
"trust_remote_code": True,
"token": True,
"max_dataset_size": max_dataset_size,
}
path = os.path.join(prefix, f"{name}")
if name == "MJHQ":
dataset = datasets.load_dataset(path, return_gt=return_gt, **kwargs)
else:
dataset = datasets.Dataset.from_dict(
load_dataset_yaml(
fetch_or_download(f"mit-han-lab/nunchaku-test/{name}.yaml", repo_type="dataset"),
max_dataset_size=max_dataset_size,
repeat=1,
),
features=datasets.Features(
{
"filename": datasets.Value("string"),
"prompt": datasets.Value("string"),
"meta_path": datasets.Value("string"),
}
),
)
return dataset
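Usage sketch of `get_dataset`, mirroring how the tests below call it (the argument values are illustrative):

```python
# Illustrative call: load 8 MJHQ prompts without decoding ground-truth images,
# as the FLUX tests below do.
dataset = get_dataset(name="MJHQ", max_dataset_size=8, return_gt=False)
for row in dataset:
    print(row["filename"], row["prompt"])
```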
import torch
from controlnet_aux import CannyDetector
from diffusers import FluxControlPipeline, FluxFillPipeline, FluxPipeline, FluxPriorReduxPipeline
from diffusers.utils import load_image
from image_gen_aux import DepthPreprocessor
from nunchaku import NunchakuFluxTransformer2dModel
def test_flux_dev_canny():
transformer = NunchakuFluxTransformer2dModel.from_pretrained("mit-han-lab/svdq-int4-flux.1-canny-dev")
pipe = FluxControlPipeline.from_pretrained(
"black-forest-labs/FLUX.1-Canny-dev", transformer=transformer, torch_dtype=torch.bfloat16
).to("cuda")
prompt = "A robot made of exotic candies and chocolates of different kinds. The background is filled with confetti and celebratory gifts."
control_image = load_image(
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/robot.png"
)
processor = CannyDetector()
control_image = processor(
control_image, low_threshold=50, high_threshold=200, detect_resolution=1024, image_resolution=1024
)
image = pipe(
prompt=prompt, control_image=control_image, height=1024, width=1024, num_inference_steps=50, guidance_scale=30.0
).images[0]
image.save("flux.1-canny-dev.png")
def test_flux_dev_depth():
transformer = NunchakuFluxTransformer2dModel.from_pretrained("mit-han-lab/svdq-int4-flux.1-depth-dev")
pipe = FluxControlPipeline.from_pretrained(
"black-forest-labs/FLUX.1-Depth-dev",
transformer=transformer,
torch_dtype=torch.bfloat16,
).to("cuda")
prompt = "A robot made of exotic candies and chocolates of different kinds. The background is filled with confetti and celebratory gifts."
control_image = load_image(
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/robot.png"
)
processor = DepthPreprocessor.from_pretrained("LiheYoung/depth-anything-large-hf")
control_image = processor(control_image)[0].convert("RGB")
image = pipe(
prompt=prompt, control_image=control_image, height=1024, width=1024, num_inference_steps=30, guidance_scale=10.0
).images[0]
image.save("flux.1-depth-dev.png")
def test_flux_dev_fill():
image = load_image("https://huggingface.co/mit-han-lab/svdq-int4-flux.1-fill-dev/resolve/main/example.png")
mask = load_image("https://huggingface.co/mit-han-lab/svdq-int4-flux.1-fill-dev/resolve/main/mask.png")
transformer = NunchakuFluxTransformer2dModel.from_pretrained("mit-han-lab/svdq-int4-flux.1-fill-dev")
pipe = FluxFillPipeline.from_pretrained(
"black-forest-labs/FLUX.1-Fill-dev", transformer=transformer, torch_dtype=torch.bfloat16
).to("cuda")
image = pipe(
prompt="A wooden basket of a cat.",
image=image,
mask_image=mask,
height=1024,
width=1024,
guidance_scale=30,
num_inference_steps=50,
max_sequence_length=512,
).images[0]
image.save("flux.1-fill-dev.png")
def test_flux_dev_redux():
pipe_prior_redux = FluxPriorReduxPipeline.from_pretrained(
"black-forest-labs/FLUX.1-Redux-dev", torch_dtype=torch.bfloat16
).to("cuda")
transformer = NunchakuFluxTransformer2dModel.from_pretrained("mit-han-lab/svdq-int4-flux.1-dev")
pipe = FluxPipeline.from_pretrained(
"black-forest-labs/FLUX.1-dev",
text_encoder=None,
text_encoder_2=None,
transformer=transformer,
torch_dtype=torch.bfloat16,
).to("cuda")
image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/robot.png")
pipe_prior_output = pipe_prior_redux(image)
images = pipe(guidance_scale=2.5, num_inference_steps=50, **pipe_prior_output).images
images[0].save("flux.1-redux-dev.png")
import os
import tempfile
import pytest
import torch
from diffusers import FluxPipeline
from peft.tuners import lora
from safetensors.torch import save_file
from tqdm import tqdm
from nunchaku import NunchakuFluxTransformer2dModel, NunchakuT5EncoderModel
from nunchaku.lora.flux import comfyui2diffusers, convert_to_nunchaku_flux_lowrank_dict, detect_format, xlab2diffusers
from ..data import get_dataset
from ..utils import already_generate, compute_lpips, hash_str_to_int
def run_pipeline(dataset, pipeline: FluxPipeline, save_dir: str, forward_kwargs: dict | None = None):
    forward_kwargs = forward_kwargs or {}
    os.makedirs(save_dir, exist_ok=True)
pipeline.set_progress_bar_config(desc="Sampling", leave=False, dynamic_ncols=True, position=1)
for row in tqdm(dataset):
filename = row["filename"]
prompt = row["prompt"]
seed = hash_str_to_int(filename)
image = pipeline(prompt, generator=torch.Generator().manual_seed(seed), **forward_kwargs).images[0]
image.save(os.path.join(save_dir, f"{filename}.png"))
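`hash_str_to_int` is imported from the test utilities; a hypothetical sketch of such a helper (an assumption, not the actual implementation) that derives a stable per-filename seed:

```python
import hashlib


def hash_str_to_int_sketch(s: str) -> int:
    # Stable across processes (unlike the built-in hash()), so the same filename
    # always maps to the same torch.Generator seed. Name and digest choice are
    # assumptions for illustration only.
    return int(hashlib.md5(s.encode("utf-8")).hexdigest(), 16) % (2**31)
```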
@pytest.mark.parametrize(
"precision,height,width,num_inference_steps,guidance_scale,use_qencoder,cpu_offload,max_dataset_size,expected_lpips",
[
("int4", 1024, 1024, 4, 0, False, False, 16, 0.258),
("int4", 1024, 1024, 4, 0, True, False, 16, 0.41),
("int4", 1024, 1024, 4, 0, True, False, 16, 0.41),
("int4", 1920, 1080, 4, 0, False, False, 16, 0.258),
("int4", 600, 800, 4, 0, False, False, 16, 0.29),
],
)
def test_flux_schnell(
precision: str,
height: int,
width: int,
num_inference_steps: int,
guidance_scale: float,
use_qencoder: bool,
cpu_offload: bool,
max_dataset_size: int,
expected_lpips: float,
):
dataset = get_dataset(name="MJHQ", max_dataset_size=max_dataset_size)
save_root = os.path.join("results", "schnell", f"w{width}h{height}t{num_inference_steps}g{guidance_scale}")
save_dir_16bit = os.path.join(save_root, "bf16")
if not already_generate(save_dir_16bit, max_dataset_size):
pipeline = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
pipeline = pipeline.to("cuda")
run_pipeline(
dataset,
pipeline,
save_dir=save_dir_16bit,
forward_kwargs={
"height": height,
"width": width,
"num_inference_steps": num_inference_steps,
"guidance_scale": guidance_scale,
},
)
del pipeline
# release the gpu memory
torch.cuda.empty_cache()
    save_dir_4bit = os.path.join(
        save_root,
        f"{precision}"
        + ("-qencoder" if use_qencoder else "")
        + ("-cpuoffload" if cpu_offload else ""),
    )
if not already_generate(save_dir_4bit, max_dataset_size):
pipeline_init_kwargs = {}
if precision == "int4":
transformer = NunchakuFluxTransformer2dModel.from_pretrained(
"mit-han-lab/svdq-int4-flux.1-schnell", offload=cpu_offload
)
else:
assert precision == "fp4"
transformer = NunchakuFluxTransformer2dModel.from_pretrained(
"mit-han-lab/svdq-fp4-flux.1-schnell", precision="fp4", offload=cpu_offload
)
pipeline_init_kwargs["transformer"] = transformer
if use_qencoder:
text_encoder_2 = NunchakuT5EncoderModel.from_pretrained("mit-han-lab/svdq-flux.1-t5")
pipeline_init_kwargs["text_encoder_2"] = text_encoder_2
pipeline = FluxPipeline.from_pretrained(
"black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16, **pipeline_init_kwargs
)
pipeline = pipeline.to("cuda")
if cpu_offload:
pipeline.enable_sequential_cpu_offload()
run_pipeline(
dataset,
pipeline,
save_dir=save_dir_4bit,
forward_kwargs={
"height": height,
"width": width,
"num_inference_steps": num_inference_steps,
"guidance_scale": guidance_scale,
},
)
del pipeline
# release the gpu memory
torch.cuda.empty_cache()
lpips = compute_lpips(save_dir_16bit, save_dir_4bit)
print(f"lpips: {lpips}")
assert lpips < expected_lpips * 1.05
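`compute_lpips` also comes from the test utilities; a rough, hypothetical sketch using `torchmetrics` (listed in the requirements at the end of this diff) is shown below; the real helper may batch, resize, or normalize differently:

```python
import os

import numpy as np
import torch
from PIL import Image
from torchmetrics.image import LearnedPerceptualImagePatchSimilarity


def compute_lpips_sketch(dir_a: str, dir_b: str, device: str = "cuda") -> float:
    # Compare same-named images from two directories and return the mean LPIPS.
    metric = LearnedPerceptualImagePatchSimilarity(net_type="alex", normalize=True).to(device)

    def load(path: str) -> torch.Tensor:
        img = np.array(Image.open(path).convert("RGB"))
        return torch.from_numpy(img).permute(2, 0, 1).float().div(255.0).unsqueeze(0).to(device)

    for name in sorted(os.listdir(dir_a)):
        metric.update(load(os.path.join(dir_a, name)), load(os.path.join(dir_b, name)))
    return metric.compute().item()
```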
LORA_PATH_MAP = {
"hypersd8": "ByteDance/Hyper-SD/Hyper-FLUX.1-dev-8steps-lora.safetensors",
"realism": "XLabs-AI/flux-RealismLora/lora.safetensors",
"ghibsky": "aleksa-codes/flux-ghibsky-illustration/lora.safetensors",
"anime": "alvdansen/sonny-anime-fixed/araminta_k_sonnyanime_fluxd_fixed.safetensors",
"sketch": "Shakker-Labs/FLUX.1-dev-LoRA-Children-Simple-Sketch/FLUX-dev-lora-children-simple-sketch.safetensors",
"yarn": "linoyts/yarn_art_Flux_LoRA/pytorch_lora_weights.safetensors",
"haunted_linework": "alvdansen/haunted_linework_flux/hauntedlinework_flux_araminta_k.safetensors",
}
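Each entry encodes a Hugging Face repo id followed by the weight file name; `run_test_flux_dev` below splits it back apart, for example:

```python
import os

repo_id = os.path.dirname(LORA_PATH_MAP["realism"])       # "XLabs-AI/flux-RealismLora"
weight_name = os.path.basename(LORA_PATH_MAP["realism"])  # "lora.safetensors"
```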
def run_test_flux_dev(
precision: str,
height: int,
width: int,
num_inference_steps: int,
guidance_scale: float,
use_qencoder: bool,
cpu_offload: bool,
lora_name: str | None,
lora_scale: float,
max_dataset_size: int,
expected_lpips: float,
):
save_root = os.path.join(
"results",
"dev",
f"w{width}h{height}t{num_inference_steps}g{guidance_scale}"
+ ("-qencoder" if use_qencoder else "")
+ (f"-{lora_name}_{lora_scale:.1f}" if lora_name else ""),
)
dataset = get_dataset(
name="MJHQ" if lora_name in [None, "hypersd8"] else lora_name, max_dataset_size=max_dataset_size
)
save_dir_16bit = os.path.join(save_root, "bf16")
if not already_generate(save_dir_16bit, max_dataset_size):
pipeline = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)
pipeline = pipeline.to("cuda")
if lora_name is not None:
pipeline.load_lora_weights(
os.path.dirname(LORA_PATH_MAP[lora_name]),
weight_name=os.path.basename(LORA_PATH_MAP[lora_name]),
adapter_name="lora",
)
for n, m in pipeline.transformer.named_modules():
if isinstance(m, lora.LoraLayer):
for name in m.scaling.keys():
m.scaling[name] = lora_scale
run_pipeline(
dataset,
pipeline,
save_dir=save_dir_16bit,
forward_kwargs={
"height": height,
"width": width,
"num_inference_steps": num_inference_steps,
"guidance_scale": guidance_scale,
},
)
del pipeline
# release the gpu memory
torch.cuda.empty_cache()
save_dir_4bit = os.path.join(save_root, f"{precision}-qencoder" if use_qencoder else f"{precision}")
if not already_generate(save_dir_4bit, max_dataset_size):
pipeline_init_kwargs = {}
if precision == "int4":
transformer = NunchakuFluxTransformer2dModel.from_pretrained(
"mit-han-lab/svdq-int4-flux.1-dev", offload=cpu_offload
)
else:
assert precision == "fp4"
transformer = NunchakuFluxTransformer2dModel.from_pretrained(
"mit-han-lab/svdq-fp4-flux.1-dev", precision="fp4", offload=cpu_offload
)
if lora_name is not None:
lora_path = LORA_PATH_MAP[lora_name]
lora_format = detect_format(lora_path)
if lora_format != "svdquant":
if lora_format == "comfyui":
input_lora = comfyui2diffusers(lora_path)
elif lora_format == "xlab":
input_lora = xlab2diffusers(lora_path)
elif lora_format == "diffusers":
input_lora = lora_path
else:
raise ValueError(f"Invalid LoRA format {lora_format}.")
state_dict = convert_to_nunchaku_flux_lowrank_dict(
"mit-han-lab/svdq-int4-flux.1-dev/transformer_blocks.safetensors", input_lora
)
with tempfile.NamedTemporaryFile(suffix=".safetensors", delete=True) as tmp_file:
save_file(state_dict, tmp_file.name)
transformer.update_lora_params(tmp_file.name)
else:
transformer.update_lora_params(lora_path)
transformer.set_lora_strength(lora_scale)
pipeline_init_kwargs["transformer"] = transformer
if use_qencoder:
text_encoder_2 = NunchakuT5EncoderModel.from_pretrained("mit-han-lab/svdq-flux.1-t5")
pipeline_init_kwargs["text_encoder_2"] = text_encoder_2
pipeline = FluxPipeline.from_pretrained(
"black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16, **pipeline_init_kwargs
)
pipeline = pipeline.to("cuda")
if cpu_offload:
pipeline.enable_sequential_cpu_offload()
run_pipeline(
dataset,
pipeline,
save_dir=save_dir_4bit,
forward_kwargs={
"height": height,
"width": width,
"num_inference_steps": num_inference_steps,
"guidance_scale": guidance_scale,
},
)
del pipeline
# release the gpu memory
torch.cuda.empty_cache()
lpips = compute_lpips(save_dir_16bit, save_dir_4bit)
print(f"lpips: {lpips}")
assert lpips < expected_lpips * 1.05
@pytest.mark.parametrize("cpu_offload", [False, True])
def test_flux_dev_base(cpu_offload: bool):
run_test_flux_dev(
precision="int4",
height=1024,
width=1024,
num_inference_steps=50,
guidance_scale=3.5,
use_qencoder=False,
cpu_offload=cpu_offload,
lora_name=None,
lora_scale=0,
max_dataset_size=8,
expected_lpips=0.16,
)
def test_flux_dev_qencoder_800x600():
run_test_flux_dev(
precision="int4",
height=800,
width=600,
num_inference_steps=50,
guidance_scale=3.5,
use_qencoder=True,
cpu_offload=False,
lora_name=None,
lora_scale=0,
max_dataset_size=8,
expected_lpips=0.36,
)
def test_flux_dev_hypersd8_1080x1920():
run_test_flux_dev(
precision="int4",
height=1080,
width=1920,
num_inference_steps=8,
guidance_scale=3.5,
use_qencoder=False,
cpu_offload=False,
lora_name="hypersd8",
lora_scale=0.125,
max_dataset_size=8,
expected_lpips=0.44,
)
@pytest.mark.parametrize(
"num_inference_steps,lora_name,lora_scale,cpu_offload,expected_lpips",
[
(25, "realism", 0.9, False, 0.16),
(25, "ghibsky", 1, False, 0.16),
(28, "anime", 1, False, 0.27),
(24, "sketch", 1, False, 0.35),
(28, "yarn", 1, False, 0.22),
(25, "haunted_linework", 1, False, 0.34),
],
)
def test_flux_dev_loras(num_inference_steps, lora_name, lora_scale, cpu_offload, expected_lpips):
run_test_flux_dev(
precision="int4",
height=1024,
width=1024,
num_inference_steps=num_inference_steps,
guidance_scale=3.5,
use_qencoder=False,
cpu_offload=cpu_offload,
lora_name=lora_name,
lora_scale=lora_scale,
max_dataset_size=8,
expected_lpips=expected_lpips,
)
@pytest.mark.parametrize(
"use_qencoder,cpu_offload,memory_limit",
[
(False, False, 17),
(False, True, 13),
(True, False, 12),
(True, True, 6),
],
)
def test_flux_schnell_memory(use_qencoder: bool, cpu_offload: bool, memory_limit: float):
torch.cuda.reset_peak_memory_stats()
pipeline_init_kwargs = {
"transformer": NunchakuFluxTransformer2dModel.from_pretrained(
"mit-han-lab/svdq-int4-flux.1-schnell", offload=cpu_offload
)
}
if use_qencoder:
text_encoder_2 = NunchakuT5EncoderModel.from_pretrained("mit-han-lab/svdq-flux.1-t5")
pipeline_init_kwargs["text_encoder_2"] = text_encoder_2
pipeline = FluxPipeline.from_pretrained(
"black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16, **pipeline_init_kwargs
).to("cuda")
if cpu_offload:
pipeline.enable_sequential_cpu_offload()
pipeline(
"A cat holding a sign that says hello world", width=1024, height=1024, num_inference_steps=50, guidance_scale=0
)
memory = torch.cuda.max_memory_reserved(0) / 1024**3
assert memory < memory_limit
del pipeline
# release the gpu memory
torch.cuda.empty_cache()
pytest
datasets
torchmetrics
mediapipe
controlnet_aux
peft
git+https://github.com/asomoza/image_gen_aux.git
\ No newline at end of file