merge v0.4.1

99b471c2 · zhuwenwen · 1925d2e9 · 468d761b · 99b471c2 · 99b471c2
Commit 99b471c2 authored May 21, 2024 by zhuwenwen
20 changed files
--- a/csrc/cpu/cpu_types.hpp
+++ b/csrc/cpu/cpu_types.hpp
+#ifndef CPU_TYPES_HPP
+#define CPU_TYPES_HPP
+#include <immintrin.h>
+#include <torch/extension.h>
+namespace vec_op {
+// FIXME: FP16 is not fully supported in Torch-CPU
+// Expands to one AT_DISPATCH_CASE per scalar type handled by the CPU kernels.
+// Only Float and BFloat16 are listed here; Half is not dispatched.
+#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)                                 \
+  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)                         \
+  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
+// Switches on TYPE and runs the trailing callable for the matching case above.
+// NAME is the label forwarded to AT_DISPATCH_SWITCH.
+#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...)                          \
+  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
+// Kernel entry/exit tracing. Without CPU_OP_GUARD both macros expand to
+// nothing; with it they print the kernel name to std::cout on entry and exit.
+// The expansions already end in ';', so call sites omit the semicolon.
+#ifndef CPU_OP_GUARD
+#define CPU_KERNEL_GUARD_IN(NAME)
+#define CPU_KERNEL_GUARD_OUT(NAME)
+#else
+#define CPU_KERNEL_GUARD_IN(NAME)                                              \
+  std::cout << #NAME << " invoked." << std::endl;
+#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl;
+#endif
+// GCC/Clang attribute spelling that requests inlining regardless of heuristics.
+#define FORCE_INLINE __attribute__((always_inline)) inline
+namespace {
+// Calls `body` once for every value in `Indices`, in order. Each index is
+// passed as a std::integral_constant so the callee may use it at compile time.
+template <typename IndexT, IndexT... Indices, typename Body>
+constexpr void unroll_loop_item(std::integer_sequence<IndexT, Indices...>,
+                                Body &&body) {
+  (body(std::integral_constant<IndexT, Indices>{}), ...);
+}
+} // namespace
+// Invokes `f` with 0, 1, ..., count - 1 through a fold expression instead of a
+// run-time loop. Only participates in overload resolution when `f` is callable
+// with a `T`.
+template <typename T, T count, typename F,
+          typename = std::enable_if_t<std::is_invocable_v<F, T>>>
+constexpr void unroll_loop(F &&f) {
+  using index_sequence_t = std::make_integer_sequence<T, count>;
+  unroll_loop_item(index_sequence_t{}, std::forward<F>(f));
+}
+// Common base for the SIMD wrapper types below. Each wrapper derives from
+// Vec<Itself> and supplies a static VEC_ELEM_NUM, which get_elem_num() returns.
+template <typename T> struct Vec {
+  constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }
+};
+struct FP32Vec8;
+struct FP32Vec16;
+#ifdef __AVX512FP16__
+// Eight half-precision values held in a 128-bit `__m128h` register. Only
+// compiled when the compiler defines __AVX512FP16__.
+struct FP16Vec8 : public Vec<FP16Vec8> {
+  constexpr static int VEC_ELEM_NUM = 8;
+  __m128h reg;
+  // Broadcasts `v` to every lane.
+  explicit FP16Vec8(_Float16 v) : reg(_mm_set1_ph(v)) {}
+  // Unaligned load of eight values starting at `ptr`.
+  explicit FP16Vec8(const void *ptr) : reg(_mm_loadu_ph(ptr)) {}
+  // Wraps an existing register.
+  explicit FP16Vec8(__m128h data) : reg(data) {}
+  // Lane-wise arithmetic; each operator returns a new vector.
+  FP16Vec8 operator*(const FP16Vec8 &b) const {
+    return FP16Vec8(_mm_mul_ph(reg, b.reg));
+  }
+  FP16Vec8 operator+(const FP16Vec8 &b) const {
+    return FP16Vec8(_mm_add_ph(reg, b.reg));
+  }
+  FP16Vec8 operator-(const FP16Vec8 &b) const {
+    return FP16Vec8(_mm_sub_ph(reg, b.reg));
+  }
+  FP16Vec8 operator/(const FP16Vec8 &b) const {
+    return FP16Vec8(_mm_div_ph(reg, b.reg));
+  }
+  // Unaligned store of all eight lanes to `ptr`.
+  void save(void *ptr) const { _mm_storeu_ph(ptr, reg); }
+};
+#endif
+// Eight bfloat16 values kept as raw 16-bit lanes of a 128-bit integer register.
+struct BF16Vec8 : public Vec<BF16Vec8> {
+  constexpr static int VEC_ELEM_NUM = 8;
+  __m128i reg;
+  // Loads 16 bytes from `ptr` with an unaligned load.
+  explicit BF16Vec8(const void *ptr)
+      : reg(_mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr))) {}
+  // Conversion from eight floats; only declared here.
+  explicit BF16Vec8(const FP32Vec8 &);
+  // Writes 16 bytes to `ptr`. Uses an unaligned store so save() accepts the
+  // same pointers as the loading constructor; dereferencing a `__m128i *`
+  // would let the compiler assume 16-byte alignment.
+  void save(void *ptr) const {
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr), reg);
+  }
+};
+// Sixteen bfloat16 values kept as raw 16-bit lanes of a 256-bit integer
+// register.
+struct BF16Vec16 : public Vec<BF16Vec16> {
+  constexpr static int VEC_ELEM_NUM = 16;
+  __m256i reg;
+  // Loads 32 bytes from `ptr` with an unaligned load.
+  explicit BF16Vec16(const void *ptr)
+      : reg(_mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr))) {}
+  // Conversion from sixteen floats; only declared here.
+  explicit BF16Vec16(const FP32Vec16 &);
+  // Writes 32 bytes to `ptr`. Uses an unaligned store to mirror the unaligned
+  // load above; dereferencing a `__m256i *` would let the compiler assume
+  // 32-byte alignment.
+  void save(void *ptr) const {
+    _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), reg);
+  }
+};
+// Thirty-two bfloat16 values kept as raw 16-bit lanes of a 512-bit integer
+// register.
+struct BF16Vec32 : public Vec<BF16Vec32> {
+  constexpr static int VEC_ELEM_NUM = 32;
+  __m512i reg;
+  // Loads 64 bytes from `ptr` with an unaligned load.
+  explicit BF16Vec32(const void *ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {}
+  // Wraps an existing register.
+  explicit BF16Vec32(__m512i data) : reg(data) {}
+  // Replicates the eight values of `vec8_data` into all four 128-bit lanes.
+  // The argument is only read, so it is taken by const reference.
+  explicit BF16Vec32(const BF16Vec8 &vec8_data)
+      : reg((__m512i)_mm512_inserti32x4(
+            _mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
+                                                      (__m128i)vec8_data.reg),
+                                                  (__m128i)vec8_data.reg, 1),
+                               (__m128i)vec8_data.reg, 2),
+            (__m128i)vec8_data.reg, 3)) {}
+  // Writes 64 bytes to `ptr`. Uses an unaligned store to mirror the unaligned
+  // load above; dereferencing a `__m512i *` would let the compiler assume
+  // 64-byte alignment.
+  void save(void *ptr) const { _mm512_storeu_si512(ptr, reg); }
+};
+// Four single-precision floats in a 128-bit SSE register. Provides
+// construction only; no arithmetic or store members are defined here.
+struct FP32Vec4 : public Vec<FP32Vec4> {
+  constexpr static int VEC_ELEM_NUM = 4;
+  // Overlay of the register with a scalar array for lane access.
+  union AliasReg {
+    __m128 reg;
+    float values[VEC_ELEM_NUM];
+  };
+  __m128 reg;
+  // Broadcasts `v` to every lane.
+  explicit FP32Vec4(float v) : reg(_mm_set1_ps(v)) {}
+  // All lanes zero.
+  explicit FP32Vec4() : reg(_mm_set1_ps(0.0)) {}
+  // Unaligned load of four floats from `ptr`.
+  explicit FP32Vec4(const float *ptr) : reg(_mm_loadu_ps(ptr)) {}
+  // Wraps an existing register.
+  explicit FP32Vec4(__m128 data) : reg(data) {}
+  explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {}
+};
+// Eight single-precision floats in a 256-bit AVX register, with lane-wise
+// arithmetic and a few scalar-fallback math functions.
+struct FP32Vec8 : public Vec<FP32Vec8> {
+  constexpr static int VEC_ELEM_NUM = 8;
+  // Overlay of the register with a scalar array, used to read single lanes.
+  union AliasReg {
+    __m256 reg;
+    float values[VEC_ELEM_NUM];
+  };
+  __m256 reg;
+  // Broadcasts `v` to every lane.
+  explicit FP32Vec8(float v) : reg(_mm256_set1_ps(v)) {}
+  // All lanes zero.
+  explicit FP32Vec8() : reg(_mm256_set1_ps(0.0)) {}
+  // Unaligned load of eight floats from `ptr`.
+  explicit FP32Vec8(const float *ptr) : reg(_mm256_loadu_ps(ptr)) {}
+  // Wraps an existing register.
+  explicit FP32Vec8(__m256 data) : reg(data) {}
+  explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {}
+#ifdef __AVX512FP16__
+  // Widens eight half-precision values to float.
+  explicit FP32Vec8(__m128h v) : reg(_mm256_cvtph_ps(_mm_castph_si128(v))) {}
+#endif
+  // Widens eight bfloat16 values to float: each 16-bit lane is zero-extended
+  // to 32 bits, then a 2-byte left shift moves it into the upper half.
+  explicit FP32Vec8(const BF16Vec8 &v)
+      : reg(_mm256_castsi256_ps(
+            _mm256_bslli_epi128(_mm256_cvtepu16_epi32(v.reg), 2))) {}
+  // Returns the sum of all eight lanes, accumulated in lane order 0..7.
+  float reduce_sum() const {
+    AliasReg ar;
+    ar.reg = reg;
+    float result = 0;
+    unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) { result += ar.values[i]; });
+    return result;
+  }
+  // Lane-wise e^x computed with scalar expf.
+  FP32Vec8 exp() const {
+    AliasReg ar;
+    ar.reg = reg;
+    return FP32Vec8(_mm256_set_ps(expf(ar.values[7]), expf(ar.values[6]),
+                                  expf(ar.values[5]), expf(ar.values[4]),
+                                  expf(ar.values[3]), expf(ar.values[2]),
+                                  expf(ar.values[1]), expf(ar.values[0])));
+  }
+  // Lane-wise hyperbolic tangent computed with scalar tanhf.
+  FP32Vec8 tanh() const {
+    AliasReg ar;
+    ar.reg = reg;
+    return FP32Vec8(_mm256_set_ps(tanhf(ar.values[7]), tanhf(ar.values[6]),
+                                  tanhf(ar.values[5]), tanhf(ar.values[4]),
+                                  tanhf(ar.values[3]), tanhf(ar.values[2]),
+                                  tanhf(ar.values[1]), tanhf(ar.values[0])));
+  }
+  // Lane-wise error function. Uses erff, matching expf/tanhf above, so the
+  // single-precision routine is always selected rather than whichever `erf`
+  // overload happens to be visible.
+  FP32Vec8 er() const {
+    AliasReg ar;
+    ar.reg = reg;
+    return FP32Vec8(_mm256_set_ps(erff(ar.values[7]), erff(ar.values[6]),
+                                  erff(ar.values[5]), erff(ar.values[4]),
+                                  erff(ar.values[3]), erff(ar.values[2]),
+                                  erff(ar.values[1]), erff(ar.values[0])));
+  }
+  // Lane-wise arithmetic; each operator returns a new vector.
+  FP32Vec8 operator*(const FP32Vec8 &b) const {
+    return FP32Vec8(_mm256_mul_ps(reg, b.reg));
+  }
+  FP32Vec8 operator+(const FP32Vec8 &b) const {
+    return FP32Vec8(_mm256_add_ps(reg, b.reg));
+  }
+  FP32Vec8 operator-(const FP32Vec8 &b) const {
+    return FP32Vec8(_mm256_sub_ps(reg, b.reg));
+  }
+  FP32Vec8 operator/(const FP32Vec8 &b) const {
+    return FP32Vec8(_mm256_div_ps(reg, b.reg));
+  }
+  // Unaligned store of all eight lanes to `ptr`.
+  void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); }
+};
+// Sixteen single-precision floats in a 512-bit AVX-512 register.
+struct FP32Vec16 : public Vec<FP32Vec16> {
+  constexpr static int VEC_ELEM_NUM = 16;
+  // Overlay of the register with a scalar array for lane access.
+  union AliasReg {
+    __m512 reg;
+    float values[VEC_ELEM_NUM];
+  };
+  __m512 reg;
+  // Broadcasts `v` to every lane.
+  explicit FP32Vec16(float v) : reg(_mm512_set1_ps(v)) {}
+  // All lanes zero.
+  explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {}
+  // Unaligned load of sixteen floats from `ptr`.
+  explicit FP32Vec16(const float *ptr) : reg(_mm512_loadu_ps(ptr)) {}
+  // Wraps an existing register.
+  explicit FP32Vec16(__m512 data) : reg(data) {}
+  explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {}
+  // Replicates the four floats of `data` into all four 128-bit lanes.
+  explicit FP32Vec16(const FP32Vec4 &data)
+      : reg((__m512)_mm512_inserti32x4(
+            _mm512_inserti32x4(
+                _mm512_inserti32x4(_mm512_castsi128_si512((__m128i)data.reg),
+                                   (__m128i)data.reg, 1),
+                (__m128i)data.reg, 2),
+            (__m128i)data.reg, 3)) {}
+  // Replicates the eight floats of `data` into both 256-bit halves.
+  explicit FP32Vec16(const FP32Vec8 &data)
+      : reg((__m512)_mm512_inserti32x8(
+            _mm512_castsi256_si512((__m256i)data.reg), (__m256i)data.reg, 1)) {}
+  // Widens sixteen bfloat16 values to float: zero-extend each 16-bit lane to
+  // 32 bits, then shift left by 2 bytes so it lands in the upper half.
+  explicit FP32Vec16(const BF16Vec16 &v)
+      : reg(_mm512_castsi512_ps(
+            _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {}
+  // Widens eight bfloat16 values to float and duplicates them into both halves.
+  explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
+  // Lane-wise arithmetic; each operator returns a new vector.
+  FP32Vec16 operator*(const FP32Vec16 &b) const {
+    return FP32Vec16(_mm512_mul_ps(reg, b.reg));
+  }
+  FP32Vec16 operator+(const FP32Vec16 &b) const {
+    return FP32Vec16(_mm512_add_ps(reg, b.reg));
+  }
+  FP32Vec16 operator-(const FP32Vec16 &b) const {
+    return FP32Vec16(_mm512_sub_ps(reg, b.reg));
+  }
+  FP32Vec16 operator/(const FP32Vec16 &b) const {
+    return FP32Vec16(_mm512_div_ps(reg, b.reg));
+  }
+  // Sum of all sixteen lanes.
+  float reduce_sum() const { return _mm512_reduce_add_ps(reg); }
+  // Sum of the `idx`-th group of `group_size` consecutive lanes. The mask has
+  // `group_size` low bits set and is shifted up to select the requested group.
+  // NOTE(review): `idx` is not range-checked; callers presumably pass
+  // 0 <= idx < VEC_ELEM_NUM / group_size - confirm.
+  template <int group_size> float reduce_sub_sum(int idx) {
+    static_assert(VEC_ELEM_NUM % group_size == 0);
+    constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
+    __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size));
+    return _mm512_mask_reduce_add_ps(mask, reg);
+  }
+  // Unaligned store of all sixteen lanes to `ptr`.
+  void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); }
+};
+// Maps a scalar element type to the SIMD wrapper used to load and store it.
+// The primary template yields `void` for element types without a wrapper.
+template <typename T> struct VecType { using vec_type = void; };
+// Shorthand for VecType<T>::vec_type.
+template <typename T> using vec_t = typename VecType<T>::vec_type;
+template <> struct VecType<float> { using vec_type = FP32Vec8; };
+#ifdef __AVX512FP16__
+// FP16Vec8 is the only half-precision wrapper defined in this header, and its
+// width of 8 matches the float and bfloat16 mappings.
+template <> struct VecType<c10::Half> { using vec_type = FP16Vec8; };
+#endif
+template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; };
+// Stores the float `v` into `*ptr`, converting to T by plain assignment.
+template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; }
+#ifdef __AVX512FP16__
+// Half specialization: writes through a `_Float16` view of the destination so
+// the compiler's native float-to-half conversion is used.
+template <> inline void storeFP32<c10::Half>(float v, c10::Half *ptr) {
+  *reinterpret_cast<_Float16 *>(ptr) = v;
+}
+#endif
+// Accumulates a * b into acc, lane-wise. Written as a separate multiply and
+// add through the operators above rather than a fused intrinsic.
+inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
+  acc = acc + a * b;
+}
+// float -> bfloat16 conversions. Two implementations are selected at compile
+// time depending on whether the AVX512-BF16 extension is available.
+#ifdef __AVX512BF16__
+// Native path: scalar and vector conversions use the dedicated convert
+// intrinsics.
+template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
+  *reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v);
+}
+inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
+    : reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {}
+inline BF16Vec16::BF16Vec16(const FP32Vec16 &v)
+    : reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {}
+// Dot-product accumulate of bfloat16 pairs from `a` and `b` into the float
+// accumulator `acc`.
+inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) {
+  acc.reg = _mm512_dpbf16_ps(acc.reg, (__m512bh)a.reg, (__m512bh)b.reg);
+}
+#else
+// Fallback path: keeps the second 16-bit half of each float (the upper 16
+// bits on little-endian x86) and drops the rest; no rounding step is applied.
+template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
+  c10::BFloat16 __attribute__((__may_alias__)) *v_ptr =
+      reinterpret_cast<c10::BFloat16 *>(&v);
+  *ptr = *(v_ptr + 1);
+}
+// Vector form of the same truncation: a 2-byte right shift brings each float's
+// upper half into the low 16 bits, then the 32->16 bit narrowing keeps them.
+inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
+    : reg(_mm256_cvtepi32_epi16(
+          _mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {}
+inline BF16Vec16::BF16Vec16(const FP32Vec16 &v)
+    : reg(_mm512_cvtepi32_epi16(
+          _mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {}
+#endif
+// Issues a software prefetch hint for `addr` using the _MM_HINT_T1 locality
+// hint.
+inline void prefetch(const void *addr) { _mm_prefetch(addr, _MM_HINT_T1); }
+}; // namespace vec_op
+#endif
--- a/csrc/cpu/layernorm.cpp
+++ b/csrc/cpu/layernorm.cpp
+#include "cpu_types.hpp"
+namespace {
+// RMS normalization of each row of `input`, written to `out`:
+//   out[i][j] = input[i][j] * weight[j] / sqrt(mean_j(input[i][j]^2) + epsilon)
+// Rows are `hidden_size` elements long and there are `num_tokens` of them.
+// `hidden_size` must be a multiple of the vector width (checked below).
+// Accumulation is done in float regardless of scalar_t.
+template <typename scalar_t>
+void rms_norm_impl(scalar_t *__restrict__ out,
+                       const scalar_t *__restrict__ input,
+                       const scalar_t *__restrict__ weight, const float epsilon,
+                       const int num_tokens, const int hidden_size) {
+  using scalar_vec_t = vec_op::vec_t<scalar_t>;
+  constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();
+  TORCH_CHECK(hidden_size % VEC_ELEM_NUM == 0);
+#pragma omp parallel for
+  for (int i = 0; i < num_tokens; ++i) {
+    vec_op::FP32Vec8 variance(0.0);
+    // Row offset in 64 bits: num_tokens * hidden_size may exceed INT_MAX.
+    const int64_t row_offset = static_cast<int64_t>(i) * hidden_size;
+    auto input_p = input + row_offset;
+    auto output_p = out + row_offset;
+    // Pass 1: accumulate the sum of squares, eight lanes at a time.
+    for (int j = 0; j < hidden_size; j += VEC_ELEM_NUM) {
+      scalar_vec_t x(input_p + j);
+      vec_op::FP32Vec8 fp32_x(x);
+      variance = variance + fp32_x * fp32_x;
+    }
+    float s_variance =
+        1.0f / sqrtf(variance.reduce_sum() / (float)hidden_size + epsilon);
+    vec_op::FP32Vec8 fp32_s_variance(s_variance);
+    // Pass 2: scale by the inverse RMS and the per-column weight.
+    for (int j = 0; j < hidden_size; j += VEC_ELEM_NUM) {
+      scalar_vec_t x(input_p + j);
+      scalar_vec_t w(weight + j);
+      vec_op::FP32Vec8 fp32_x(x);
+      vec_op::FP32Vec8 fp32_w(w);
+      vec_op::FP32Vec8 fp32_out = fp32_x * fp32_s_variance * fp32_w;
+      // Named `result` so it does not shadow the `out` parameter.
+      scalar_vec_t result(fp32_out);
+      result.save(output_p + j);
+    }
+  }
+}
+// In-place fused residual add and RMS normalization. For each row:
+//   residual <- input + residual            (rounded to scalar_t)
+//   input    <- residual * weight / sqrt(mean(sum^2) + epsilon)
+// The mean of squares uses the float sum before it is rounded to scalar_t;
+// the second pass re-reads the rounded value from `residual`.
+// `hidden_size` must be a multiple of the vector width (checked below).
+template <typename scalar_t>
+void fused_add_rms_norm_impl(scalar_t *__restrict__ input,
+                                 scalar_t *__restrict__ residual,
+                                 const scalar_t *__restrict__ weight,
+                                 const float epsilon, const int num_tokens,
+                                 const int hidden_size) {
+  using scalar_vec_t = vec_op::vec_t<scalar_t>;
+  constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();
+  TORCH_CHECK(hidden_size % VEC_ELEM_NUM == 0);
+#pragma omp parallel for
+  for (int i = 0; i < num_tokens; ++i) {
+    vec_op::FP32Vec8 variance(0.0);
+    // Row offset in 64 bits: num_tokens * hidden_size may exceed INT_MAX.
+    const int64_t row_offset = static_cast<int64_t>(i) * hidden_size;
+    auto input_p = input + row_offset;
+    auto residual_p = residual + row_offset;
+    // Pass 1: add the residual, accumulate squares, store the sum back.
+    for (int j = 0; j < hidden_size; j += VEC_ELEM_NUM) {
+      scalar_vec_t x(input_p + j);
+      scalar_vec_t res(residual_p + j);
+      vec_op::FP32Vec8 fp32_x(x);
+      vec_op::FP32Vec8 fp32_res(res);
+      fp32_x = fp32_x + fp32_res;
+      variance = variance + fp32_x * fp32_x;
+      scalar_vec_t summed(fp32_x);
+      summed.save(residual_p + j);
+    }
+    float s_variance =
+        1.0f / sqrtf(variance.reduce_sum() / (float)hidden_size + epsilon);
+    vec_op::FP32Vec8 fp32_s_variance(s_variance);
+    // Pass 2: normalize the stored sum and write the result over `input`.
+    for (int j = 0; j < hidden_size; j += VEC_ELEM_NUM) {
+      scalar_vec_t w(weight + j);
+      scalar_vec_t res(residual_p + j);
+      vec_op::FP32Vec8 fp32_w(w);
+      vec_op::FP32Vec8 fp32_res(res);
+      vec_op::FP32Vec8 fp32_out = fp32_res * fp32_s_variance * fp32_w;
+      scalar_vec_t normalized(fp32_out);
+      normalized.save(input_p + j);
+    }
+  }
+}
+} // namespace
+// Tensor-level entry point: derives the row length and row count from `input`
+// and dispatches rms_norm_impl on the input's scalar type.
+void rms_norm(torch::Tensor &out, torch::Tensor &input,
+                  torch::Tensor &weight, float epsilon) {
+  // The last dimension is the normalized one; everything else counts as rows.
+  const int hidden_size = static_cast<int>(input.size(-1));
+  const int num_tokens = static_cast<int>(input.numel() / hidden_size);
+  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_impl", [&] {
+    CPU_KERNEL_GUARD_IN(rms_norm_impl)
+    scalar_t *out_data = out.data_ptr<scalar_t>();
+    const scalar_t *input_data = input.data_ptr<scalar_t>();
+    const scalar_t *weight_data = weight.data_ptr<scalar_t>();
+    rms_norm_impl(out_data, input_data, weight_data, epsilon, num_tokens,
+                  hidden_size);
+    CPU_KERNEL_GUARD_OUT(rms_norm_impl)
+  });
+}
+// Tensor-level entry point for the in-place fused add + RMS norm: derives the
+// row length and row count from `input` and dispatches on its scalar type.
+void fused_add_rms_norm(torch::Tensor &input, torch::Tensor &residual,
+                            torch::Tensor &weight, float epsilon) {
+  // The last dimension is the normalized one; everything else counts as rows.
+  const int hidden_size = static_cast<int>(input.size(-1));
+  const int num_tokens = static_cast<int>(input.numel() / hidden_size);
+  VLLM_DISPATCH_FLOATING_TYPES(
+      input.scalar_type(), "fused_add_rms_norm_impl", [&] {
+        CPU_KERNEL_GUARD_IN(fused_add_rms_norm_impl)
+        scalar_t *input_data = input.data_ptr<scalar_t>();
+        scalar_t *residual_data = residual.data_ptr<scalar_t>();
+        const scalar_t *weight_data = weight.data_ptr<scalar_t>();
+        fused_add_rms_norm_impl(input_data, residual_data, weight_data,
+                                epsilon, num_tokens, hidden_size);
+        CPU_KERNEL_GUARD_OUT(fused_add_rms_norm_impl)
+      });
+}
--- a/csrc/cpu/pos_encoding.cpp
+++ b/csrc/cpu/pos_encoding.cpp
+#include "cpu_types.hpp"
+namespace {
+// Rotary embedding, half-split layout: within the first `rot_dim` elements of
+// each head, element j is paired with element j + rot_dim / 2. Query and key
+// are rotated in place:
+//   x' = x * cos - y * sin
+//   y' = y * cos + x * sin
+// cos/sin for a token come from row positions[token] of `cos_sin_cache`
+// (cos in the first half of the row, sin in the second).
+// rot_dim / 2 must be a multiple of the vector width (checked below).
+template <typename scalar_t>
+void rotary_embedding_impl(
+    const int64_t
+        *__restrict__ positions, // [batch_size, seq_len] or [num_tokens]
+    scalar_t
+        *__restrict__ query, /// [batch_size, seq_len, num_heads, head_size] or
+                             /// [num_tokens, num_heads, head_size]
+    scalar_t
+        *__restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or
+                           // [num_tokens, num_kv_heads, head_size]
+    const scalar_t
+        *__restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2]
+    const int rot_dim, const int64_t query_stride, const int64_t key_stride,
+    const int num_heads, const int num_kv_heads, const int head_size,
+    const int num_tokens) {
+  using scalar_vec_t = vec_op::vec_t<scalar_t>;
+  constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();
+  const int embed_dim = rot_dim / 2;
+  TORCH_CHECK(embed_dim % VEC_ELEM_NUM == 0);
+#pragma omp parallel for
+  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
+    const int64_t pos = positions[token_idx];
+    const scalar_t *cache_ptr = cos_sin_cache + pos * rot_dim;
+    // Rotates all `heads` heads of the current token inside `data`, whose
+    // per-token stride is `stride`. Shared by the query and key passes.
+    auto rotate_heads = [&](scalar_t *data, const int heads,
+                            const int64_t stride) {
+      for (int head_idx = 0; head_idx < heads; ++head_idx) {
+        const int64_t token_head = token_idx * stride + head_idx * head_size;
+        for (int j = 0; j < embed_dim; j += VEC_ELEM_NUM) {
+          const int x_index = j;
+          const int y_index = embed_dim + j;
+          const int64_t out_x = token_head + x_index;
+          const int64_t out_y = token_head + y_index;
+          const scalar_vec_t cos_v(cache_ptr + x_index);
+          const scalar_vec_t sin_v(cache_ptr + y_index);
+          const scalar_vec_t x_v(data + out_x);
+          const scalar_vec_t y_v(data + out_y);
+          // Widen to float for the arithmetic, narrow again on store.
+          vec_op::FP32Vec8 fp32_cos(cos_v);
+          vec_op::FP32Vec8 fp32_sin(sin_v);
+          vec_op::FP32Vec8 fp32_x(x_v);
+          vec_op::FP32Vec8 fp32_y(y_v);
+          auto out1 = fp32_x * fp32_cos - fp32_y * fp32_sin;
+          scalar_vec_t(out1).save(data + out_x);
+          auto out2 = fp32_y * fp32_cos + fp32_x * fp32_sin;
+          scalar_vec_t(out2).save(data + out_y);
+        }
+      }
+    };
+    rotate_heads(query, num_heads, query_stride);
+    rotate_heads(key, num_kv_heads, key_stride);
+  }
+}
+// Rotary embedding, interleaved layout: within each head, elements 2j and
+// 2j + 1 form a pair for j in [0, rot_dim / 2). Query and key are rotated in
+// place with
+//   x' = x * cos - y * sin
+//   y' = y * cos + x * sin
+// where cos/sin come from row positions[token] of `cos_sin_cache` (cos in the
+// first half of the row, sin in the second). Arithmetic is done in float.
+template <typename scalar_t>
+void rotary_embedding_gptj_impl(
+    const int64_t
+        *__restrict__ positions, // [batch_size, seq_len] or [num_tokens]
+    scalar_t
+        *__restrict__ query, /// [batch_size, seq_len, num_heads, head_size] or
+                             /// [num_tokens, num_heads, head_size]
+    scalar_t
+        *__restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or
+                           // [num_tokens, num_kv_heads, head_size]
+    const scalar_t
+        *__restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2]
+    const int rot_dim, const int64_t query_stride, const int64_t key_stride,
+    const int num_heads, const int num_kv_heads, const int head_size,
+    const int num_tokens) {
+  const int embed_dim = rot_dim / 2;
+  // Rotates every head of every token stored in `tensor`; the loop nest over
+  // (token, head) is handed to OpenMP as one collapsed iteration space.
+  const auto rotate_interleaved = [&](scalar_t *tensor, const int head_count,
+                                      const int64_t token_stride) {
+#pragma omp parallel for collapse(2)
+    for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
+      for (int head_idx = 0; head_idx < head_count; ++head_idx) {
+        const scalar_t *cos_row =
+            cos_sin_cache + positions[token_idx] * rot_dim;
+        const scalar_t *sin_row = cos_row + embed_dim;
+        scalar_t *head =
+            tensor + (token_idx * token_stride + head_idx * head_size);
+        for (int pair = 0; pair < embed_dim; ++pair) {
+          const float c = cos_row[pair];
+          const float s = sin_row[pair];
+          scalar_t &even = head[2 * pair];
+          scalar_t &odd = head[2 * pair + 1];
+          // Read both values before either is overwritten.
+          const float x = even;
+          const float y = odd;
+          even = x * c - y * s;
+          odd = y * c + x * s;
+        }
+      }
+    }
+  };
+  rotate_interleaved(query, num_heads, query_stride);
+  rotate_interleaved(key, num_kv_heads, key_stride);
+}
+}; // namespace
+// Tensor-level entry point for rotary embedding. Derives sizes and strides
+// from the tensors, then runs the half-split kernel when `is_neox` is true and
+// the interleaved kernel otherwise. Query and key are modified in place.
+void rotary_embedding(torch::Tensor &positions, torch::Tensor &query,
+                          torch::Tensor &key, int head_size,
+                          torch::Tensor &cos_sin_cache, bool is_neox) {
+  // The last dimension of query/key packs all heads of one token.
+  const int num_tokens = static_cast<int>(query.numel() / query.size(-1));
+  const int rot_dim = static_cast<int>(cos_sin_cache.size(1));
+  const int num_heads = static_cast<int>(query.size(-1) / head_size);
+  const int num_kv_heads = static_cast<int>(key.size(-1) / head_size);
+  const int64_t query_stride = query.stride(-2);
+  const int64_t key_stride = key.stride(-2);
+  VLLM_DISPATCH_FLOATING_TYPES(
+      query.scalar_type(), "rotary_embedding_impl", [&] {
+        CPU_KERNEL_GUARD_IN(rotary_embedding_impl)
+        // Both kernels share one signature, so pick one and call it once.
+        const auto kernel = is_neox ? &rotary_embedding_impl<scalar_t>
+                                    : &rotary_embedding_gptj_impl<scalar_t>;
+        kernel(positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
+               key.data_ptr<scalar_t>(), cos_sin_cache.data_ptr<scalar_t>(),
+               rot_dim, query_stride, key_stride, num_heads, num_kv_heads,
+               head_size, num_tokens);
+        CPU_KERNEL_GUARD_OUT(rotary_embedding_impl)
+      });
+}
--- a/csrc/cpu/pybind.cpp
+++ b/csrc/cpu/pybind.cpp
+#include "cache.h"
+#include "cuda_utils.h"
+#include "ops.h"
+#include <torch/extension.h>
+// Python bindings: registers the compute operators under the `ops` submodule
+// and the KV-cache helpers under `cache_ops`.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  pybind11::module ops_module =
+      m.def_submodule("ops", "vLLM custom operators");
+  // Registration order: attention, activation, layernorm, rotary embedding.
+  ops_module
+      .def("paged_attention_v1", &paged_attention_v1,
+           "Compute the attention between an input query and the cached keys/values using PagedAttention.")
+      .def("paged_attention_v2", &paged_attention_v2, "PagedAttention V2.")
+      .def("silu_and_mul", &silu_and_mul,
+           "Activation function used in SwiGLU.")
+      .def("gelu_and_mul", &gelu_and_mul,
+           "Activation function used in GeGLU with `none` approximation.")
+      .def("gelu_tanh_and_mul", &gelu_tanh_and_mul,
+           "Activation function used in GeGLU with `tanh` approximation.")
+      .def("gelu_new", &gelu_new, "GELU implementation used in GPT-2.")
+      .def("gelu_fast", &gelu_fast, "Approximate GELU implementation.")
+      .def("rms_norm", &rms_norm,
+           "Apply Root Mean Square (RMS) Normalization to the input tensor.")
+      .def("fused_add_rms_norm", &fused_add_rms_norm,
+           "In-place fused Add and RMS Normalization")
+      .def("rotary_embedding", &rotary_embedding,
+           "Apply GPT-NeoX or GPT-J style rotary embedding to query and key");
+  pybind11::module cache_module =
+      m.def_submodule("cache_ops", "vLLM cache ops");
+  cache_module
+      .def("swap_blocks", &swap_blocks,
+           "Swap in (out) the cache blocks from src to dst")
+      .def("copy_blocks", &copy_blocks,
+           "Copy the cache blocks from src to dst")
+      .def("reshape_and_cache", &reshape_and_cache,
+           "Reshape the key and value tensors and cache them");
+}
--- a/csrc/layernorm_kernels.cu
+++ b/csrc/layernorm_kernels.cu
@@ -4,6 +4,16 @@
 #include "dispatch_utils.h"
 #include "reduction_utils.cuh"
+#ifndef USE_ROCM
+  #include <cuda_bf16.h>
+  #include <cuda_fp16.h>
+#else
+  #include <hip/hip_bf16.h>
+  #include <hip/hip_fp16.h>
+  using __nv_bfloat16 = __hip_bfloat16;
+  using __nv_bfloat162 = __hip_bfloat162;
+#endif
 namespace vllm {
@@ -35,9 +45,201 @@ __global__ void rms_norm_kernel(
  }
 }
-// TODO: Further optimize this kernel.
-template<typename scalar_t>
+/* Converter structs for the conversion from torch types to HIP/CUDA types,
-__global__ void fused_add_rms_norm_kernel(
+   and the associated type conversions within HIP/CUDA. These helpers need
+   to be implemented for now because the relevant type conversion
+   operators/constructors are not consistently implemented by HIP/CUDA, so
+   a generic conversion via type casts cannot be implemented.
+   Each struct should have the member static constexpr bool `exists`:
+   If false, the optimized kernel is not used for the corresponding torch type.
+   If true, the struct should be fully defined as shown in the examples below. 
+ */
+// Primary template: no conversion support, so `exists` is false and the
+// vectorized kernel is not selected for this torch type.
+template<typename torch_type>
+struct _typeConvert { static constexpr bool exists = false; };
+#if defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12000))
+// CUDA < 12.0 runs into issues with packed type conversion
+// FP16: maps c10::Half to __half / __half2 and provides scalar and packed
+// conversions to and from float / float2.
+template<>
+struct _typeConvert<c10::Half> {
+  static constexpr bool exists = true;
+  using hip_type = __half;
+  using packed_hip_type = __half2;
+  __device__ static inline float convert(hip_type x) { return __half2float(x); }
+  __device__ static inline float2 convert(packed_hip_type x) { return __half22float2(x); }
+  __device__ static inline hip_type convert(float x) { return __float2half_rn(x); }
+  __device__ static inline packed_hip_type convert(float2 x) { return __float22half2_rn(x); }
+};
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+// CUDA_ARCH < 800 does not have BF16 support
+// TODO: Add in ROCm support once public headers handle bf16 maturely
+// NOTE(review): this guard depends on __CUDA_ARCH__, so the specialization is
+// only visible where that macro is defined with a value >= 800 - confirm that
+// every compilation pass that instantiates the kernel sees the same `exists`.
+template<>
+struct _typeConvert<c10::BFloat16> {
+  static constexpr bool exists = true;
+  using hip_type = __nv_bfloat16;
+  using packed_hip_type = __nv_bfloat162;
+  __device__ static inline float convert(hip_type x) { return __bfloat162float(x); }
+  __device__ static inline float2 convert(packed_hip_type x) { return __bfloat1622float2(x); }
+  __device__ static inline hip_type convert(float x) { return __float2bfloat16(x); }
+  __device__ static inline packed_hip_type convert(float2 x) { return __float22bfloat162_rn(x); }
+};
+#endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+#endif // defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12000))
+/* Vector POD struct to generate vectorized and packed FP16/BF16 ops
+   for appropriate specializations of fused_add_rms_norm_kernel.
+   Only functions that are necessary in that kernel are implemented.
+   Alignment to 16 bytes is required to use 128-bit global memory ops.
+ */
+// `width` elements of the device type that corresponds to `scalar_t`. When
+// `width` is even the operators work on element pairs through the packed
+// type T2; otherwise they fall back to element-by-element code.
+template<typename scalar_t, int width>
+struct alignas(16) _f16Vec {
+  /* Not theoretically necessary that width is a power of 2 but should
+     almost always be the case for optimization purposes */
+  static_assert(width > 0 && (width & (width - 1)) == 0,
+                "Width is not a positive power of 2!");
+  using Converter = _typeConvert<scalar_t>;
+  using T1 = typename Converter::hip_type;
+  using T2 = typename Converter::packed_hip_type;
+  T1 data[width];
+  // Element-wise addition in the 16-bit type.
+  __device__ _f16Vec& operator+=(const _f16Vec<scalar_t, width>& other) {
+    if constexpr (width % 2 == 0) {
+      #pragma unroll
+      for (int i = 0; i < width; i += 2) {
+        T2 temp{data[i], data[i+1]};
+        temp += T2{other.data[i], other.data[i+1]};
+        data[i] = temp.x;
+        data[i+1] = temp.y;
+      }
+    } else {
+      #pragma unroll
+      for (int i = 0; i < width; ++i)
+        data[i] += other.data[i];
+    }
+    return *this;
+  }
+  // Element-wise multiplication in the 16-bit type.
+  __device__ _f16Vec& operator*=(const _f16Vec<scalar_t, width>& other) {
+    if constexpr (width % 2 == 0) {
+      #pragma unroll
+      for (int i = 0; i < width; i += 2) {
+        T2 temp{data[i], data[i+1]};
+        temp *= T2{other.data[i], other.data[i+1]};
+        data[i] = temp.x;
+        data[i+1] = temp.y;
+      }
+    } else {
+      #pragma unroll
+      for (int i = 0; i < width; ++i)
+        data[i] *= other.data[i];
+    }
+    return *this;
+  }
+  // Scales every element by `scale`. The multiply is done in float and the
+  // result is converted back to the 16-bit type.
+  __device__ _f16Vec& operator*=(const float scale) {
+    if constexpr (width % 2 == 0) {
+      #pragma unroll
+      for (int i = 0; i < width; i += 2) {
+        float2 temp_f = Converter::convert(T2{data[i], data[i+1]});
+        temp_f.x *= scale;
+        temp_f.y *= scale;
+        T2 temp = Converter::convert(temp_f);
+        data[i] = temp.x;
+        data[i+1] = temp.y;
+      }
+    } else {
+      #pragma unroll
+      for (int i = 0; i < width; ++i) {
+        float temp = Converter::convert(data[i]) * scale;
+        data[i] = Converter::convert(temp);
+      }
+    }
+    return *this;
+  }
+  // Returns the sum of squares of all elements, accumulated in float.
+  __device__ float sum_squares() const {
+    float result = 0.0f;
+    if constexpr (width % 2 == 0) {
+      #pragma unroll
+      for (int i = 0; i < width; i += 2) {
+        float2 z = Converter::convert(T2{data[i], data[i+1]});
+        result += z.x * z.x + z.y * z.y;
+      }
+    } else {
+      #pragma unroll
+      for (int i = 0; i < width; ++i) {
+        float x = Converter::convert(data[i]);
+        result += x * x;
+      }
+    }
+    return result;
+  }
+};
+/* Function specialization in the case of FP16/BF16 tensors.
+   Additional optimizations we can make in this case are
+   packed and vectorized operations, which help with the
+   memory latency bottleneck. */
+// Vectorized overload, enabled when width > 0 and a converter exists for
+// scalar_t. blockIdx.x selects the row; the threads of the block stride over
+// that row's `hidden_size / width` vectors.
+// NOTE(review): the reinterpret_casts below presumably require the three
+// pointers to be 16-byte aligned and hidden_size to be divisible by width -
+// confirm the launcher checks both.
+template<typename scalar_t, int width>
+__global__ std::enable_if_t<
+  (width > 0) && _typeConvert<scalar_t>::exists> fused_add_rms_norm_kernel(
+  scalar_t* __restrict__ input,           // [..., hidden_size]
+  scalar_t* __restrict__ residual,        // [..., hidden_size]
+  const scalar_t* __restrict__ weight,    // [hidden_size]
+  const float epsilon,
+  const int num_tokens,
+  const int hidden_size) {
+  // Sanity checks on our vector struct and type-punned pointer arithmetic
+  static_assert(std::is_pod_v<_f16Vec<scalar_t, width>>);
+  static_assert(sizeof(_f16Vec<scalar_t, width>) == sizeof(scalar_t) * width);
+  const int vec_hidden_size = hidden_size / width;
+  // Inverse RMS of the row, written by thread 0 and read by the whole block.
+  __shared__ float s_variance;
+  float variance = 0.0f;
+  /* These and the argument pointers are all declared `restrict` as they are
+     not aliased in practice. Argument pointers should not be dereferenced
+     in this kernel as that would be undefined behavior */
+  auto* __restrict__ input_v = reinterpret_cast<_f16Vec<scalar_t, width>*>(input);
+  auto* __restrict__ residual_v = reinterpret_cast<_f16Vec<scalar_t, width>*>(residual);
+  auto* __restrict__ weight_v = reinterpret_cast<const _f16Vec<scalar_t, width>*>(weight);
+  // Pass 1: residual <- input + residual, accumulating this thread's share of
+  // the row's sum of squares.
+  for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
+    int id = blockIdx.x * vec_hidden_size + idx;
+    _f16Vec<scalar_t, width> temp = input_v[id];
+    temp += residual_v[id];
+    variance += temp.sum_squares();
+    residual_v[id] = temp;
+  }
+  /* Keep the following if-else block in sync with the
+     calculation of max_block_size in fused_add_rms_norm */
+  if (num_tokens < 256) {
+    variance = blockReduceSum<float, 1024>(variance);
+  } else variance = blockReduceSum<float, 256>(variance);
+  if (threadIdx.x == 0) {
+    s_variance = rsqrtf(variance / hidden_size + epsilon);
+  }
+  // Make s_variance visible to every thread before it is read below.
+  __syncthreads();
+  // Pass 2: input <- residual * s_variance * weight. `weight` is indexed by
+  // the in-row position only, so it is shared by all rows.
+  for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
+    int id = blockIdx.x * vec_hidden_size + idx;
+    _f16Vec<scalar_t, width> temp = residual_v[id];
+    temp *= s_variance;
+    temp *= weight_v[idx];
+    input_v[id] = temp;
+  }
+}
+/* Generic fused_add_rms_norm_kernel
+   The width field is not used here but necessary for other specializations.
+ */
+template<typename scalar_t, int width>
+__global__ std::enable_if_t<
+  (width == 0) || !_typeConvert<scalar_t>::exists> fused_add_rms_norm_kernel(
  scalar_t* __restrict__ input,           // [..., hidden_size]
  scalar_t* __restrict__ residual,        // [..., hidden_size]
  const scalar_t* __restrict__ weight,    // [hidden_size]
@@ -48,12 +250,17 @@ __global__ void fused_add_rms_norm_kernel(
  float variance = 0.0f;
  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
-    float x = (float) input[blockIdx.x * hidden_size + idx];
+    scalar_t z = input[blockIdx.x * hidden_size + idx];
-    x += (float) residual[blockIdx.x * hidden_size + idx];
+    z += residual[blockIdx.x * hidden_size + idx];
+    float x = (float) z;
    variance += x * x;
-    residual[blockIdx.x * hidden_size + idx] = (scalar_t) x;
+    residual[blockIdx.x * hidden_size + idx] = z;
  }
-  variance = blockReduceSum<float>(variance);
+  /* Keep the following if-else block in sync with the
+     calculation of max_block_size in fused_add_rms_norm */ 
+  if (num_tokens < 256) {
+    variance = blockReduceSum<float, 1024>(variance);
+  } else variance = blockReduceSum<float, 256>(variance);
  if (threadIdx.x == 0) {
    s_variance = rsqrtf(variance / hidden_size + epsilon);
  }
@@ -93,6 +300,21 @@ void rms_norm(
    });
 }
+#define LAUNCH_FUSED_ADD_RMS_NORM(width)              \
+  VLLM_DISPATCH_FLOATING_TYPES(                       \
+    input.scalar_type(),                              \
+    "fused_add_rms_norm_kernel",                      \
+    [&] {                                             \
+      vllm::fused_add_rms_norm_kernel                 \
+      <scalar_t, width><<<grid, block, 0, stream>>>(  \
+        input.data_ptr<scalar_t>(),                   \
+        residual.data_ptr<scalar_t>(),                \
+        weight.data_ptr<scalar_t>(),                  \
+        epsilon,                                      \
+        num_tokens,                                   \
+        hidden_size);                                 \
+    });
 void fused_add_rms_norm(
  torch::Tensor& input,    // [..., hidden_size]
  torch::Tensor& residual, // [..., hidden_size]
@@ -102,19 +324,29 @@ void fused_add_rms_norm(
  int num_tokens = input.numel() / hidden_size;
  dim3 grid(num_tokens);
-  dim3 block(std::min(hidden_size, 1024));
+  /* This kernel is memory-latency bound in many scenarios.
+     When num_tokens is large, a smaller block size allows
+     for increased block occupancy on CUs and better latency
+     hiding on global mem ops. */
+  const int max_block_size = (num_tokens < 256) ? 1024 : 256;
+  dim3 block(std::min(hidden_size, max_block_size));
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  VLLM_DISPATCH_FLOATING_TYPES(
+  /* If the tensor types are FP16/BF16, try to use the optimized kernel
-    input.scalar_type(),
+    with packed + vectorized ops.
-    "fused_add_rms_norm_kernel",
+    Max optimization is achieved with a width-8 vector of FP16/BF16s
-    [&] {
+    since we can load at most 128 bits at once in a global memory op.
-      vllm::fused_add_rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
+    However, this requires each tensor's data to be aligned to 16
-        input.data_ptr<scalar_t>(),
+    bytes.
-        residual.data_ptr<scalar_t>(),
+   */
-        weight.data_ptr<scalar_t>(),
+  auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
-        epsilon,
+  auto res_ptr = reinterpret_cast<std::uintptr_t>(residual.data_ptr());
-        num_tokens,
+  auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
-        hidden_size);
+  bool ptrs_are_aligned = inp_ptr % 16 == 0 && res_ptr % 16 == 0 \
-    });
+                          && wt_ptr % 16 == 0;
+  if (ptrs_are_aligned && hidden_size % 8 == 0) {
+    LAUNCH_FUSED_ADD_RMS_NORM(8);
+  } else {
+    LAUNCH_FUSED_ADD_RMS_NORM(0);
+  }
 }
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -14,7 +14,8 @@ void paged_attention_v1(
  int block_size,
  int max_context_len,
  const c10::optional<torch::Tensor>& alibi_slopes,
-  const std::string& kv_cache_dtype);
+  const std::string& kv_cache_dtype,
+  float kv_scale);
 void paged_attention_v2(
  torch::Tensor& out,
@@ -31,7 +32,8 @@ void paged_attention_v2(
  int block_size,
  int max_context_len,
  const c10::optional<torch::Tensor>& alibi_slopes,
-  const std::string& kv_cache_dtype);
+  const std::string& kv_cache_dtype,
+  float kv_scale);
 void rms_norm(
  torch::Tensor& out,
@@ -84,6 +86,21 @@ void gelu_fast(
  torch::Tensor& input);
 #ifndef USE_ROCM
+torch::Tensor aqlm_gemm(
+  const torch::Tensor& input,
+  const torch::Tensor& codes,
+  const torch::Tensor& codebooks,
+  const torch::Tensor& scales,
+  const torch::Tensor& codebook_partition_sizes,
+  const std::optional<torch::Tensor>& bias
+);
+torch::Tensor aqlm_dequant(
+  const torch::Tensor& codes,
+  const torch::Tensor& codebooks,
+  const torch::Tensor& codebook_partition_sizes
+);
 torch::Tensor awq_gemm(
  torch::Tensor _in_feats,
  torch::Tensor _kernel,
@@ -129,6 +146,11 @@ void gptq_shuffle(
  torch::Tensor q_perm,
  int bit);
+void scaled_fp8_quant(
+  torch::Tensor& out,
+  torch::Tensor& input,
+  torch::Tensor& scale);
 void moe_align_block_size(
  torch::Tensor topk_ids,
  int num_experts,

--- a/csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu
+++ b/csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_bfloat16, nv_half)
--- a/csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu
+++ b/csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_half, nv_bfloat16)
--- a/csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu
+++ b/csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_half, nv_half)
--- a/csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu
+++ b/csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, float, nv_half)
--- a/csrc/punica/bgmv/bgmv_config.h
+++ b/csrc/punica/bgmv/bgmv_config.h
@@ -14,6 +14,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
    f(in_T, out_T, W_T, narrow, 128) \
    f(in_T, out_T, W_T, narrow, 256) \
    f(in_T, out_T, W_T, narrow, 512) \
+    f(in_T, out_T, W_T, narrow, 640) \
    f(in_T, out_T, W_T, narrow, 768) \
    f(in_T, out_T, W_T, narrow, 1024) \
    f(in_T, out_T, W_T, narrow, 1152) \
@@ -46,6 +47,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
    f(in_T, out_T, W_T, narrow, 13696) \
    f(in_T, out_T, W_T, narrow, 13824) \
    f(in_T, out_T, W_T, narrow, 14336) \
+    f(in_T, out_T, W_T, narrow, 15360) \
    f(in_T, out_T, W_T, narrow, 16384) \
    f(in_T, out_T, W_T, narrow, 20480) \
    f(in_T, out_T, W_T, narrow, 22016) \
@@ -58,8 +60,19 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
    f(in_T, out_T, W_T, narrow, 32768) \
    f(in_T, out_T, W_T, narrow, 33024) \
    f(in_T, out_T, W_T, narrow, 36864) \
+    f(in_T, out_T, W_T, narrow, 43264) \
    f(in_T, out_T, W_T, narrow, 49152) \
-// Keep above in sync with vllm/lora/layers::SamplerWithLoRA
+    f(in_T, out_T, W_T, narrow, 64000) \
+    f(in_T, out_T, W_T, narrow, 64256) \
+    f(in_T, out_T, W_T, narrow, 64512) \
+    f(in_T, out_T, W_T, narrow, 102400) \
+    f(in_T, out_T, W_T, narrow, 102656) \
+    f(in_T, out_T, W_T, narrow, 102912) \
+    f(in_T, out_T, W_T, narrow, 128000) \
+    f(in_T, out_T, W_T, narrow, 128256) \
+    f(in_T, out_T, W_T, narrow, 128512) \
+// Keep above in sync with vllm/lora/layers::LogitsProcessorWithLoRA
+// and vllm/tests/lora/test_punica.py
 // Keep this in sync with vllm/config::LoRAConfig
 #define FOR_BGMV_WIDE_NARROW(f, in_T, out_T, W_T) \

--- a/csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu
+++ b/csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_bfloat16, nv_bfloat16)
--- a/csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu
+++ b/csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_bfloat16, nv_half)
--- a/csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu
+++ b/csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_half, nv_bfloat16)
--- a/csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu
+++ b/csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, float, nv_bfloat16)
--- a/csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu
+++ b/csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_bfloat16, nv_half)
--- a/csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu
+++ b/csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_half, nv_bfloat16)
--- a/csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu
+++ b/csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, float, nv_bfloat16)
--- a/csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu
+++ b/csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, float, nv_half)
--- a/csrc/punica/bgmv/generator.py
+++ b/csrc/punica/bgmv/generator.py
@@ -18,6 +18,26 @@ for input_dtype in DTYPES:
            if weight_dtype == "fp32":
                # FP32 weights are not supported.
                continue
+            if output_dtype == "fp32":
+                # LoRA A matrix.
+                if input_dtype != weight_dtype:
+                    # NOTE(woosuk): While Punica supports the case where the
+                    # input and weight dtypes are different, we only generate
+                    # the kernels with the same dtypes to reduce the binary size.
+                    continue
+            elif input_dtype == "fp32":
+                # LoRA B matrix.
+                if output_dtype != weight_dtype:
+                    # NOTE(woosuk): While Punica supports the case where the
+                    # output and weight dtypes are different, we only generate
+                    # the kernels with the same dtypes to reduce the binary size.
+                    continue
+            elif not (input_dtype == output_dtype == weight_dtype):
+                # NOTE(woosuk): While Punica supports mixed data types for
+                # input, output, and weight, we only generate the kernels with
+                # the same data types to reduce the binary size.
+                continue
            kernel_definition = TEMPLATE.format(
                input_dtype=DTYPE_MAP[input_dtype],
                output_dtype=DTYPE_MAP[output_dtype],