restore support for block_size 8 and 32

675bceed · zhuwenwen · d231153f · 675bceed
Commit 675bceed authored Aug 13, 2024 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 12 additions and 22 deletions

csrc/attention/attention_kernels.cu +12 -22

No files found.
--- a/csrc/attention/attention_kernels.cu
+++ b/csrc/attention/attention_kernels.cu
@@ -885,25 +885,20 @@ void paged_attention_v1_launcher(
 // 1, 2, 4, 64, 128, 256.
 #define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE)         \
  switch (block_size) {                                           \
+    case 8:                                                       \
+      CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE);         \
+      break;                                                      \
    case 16:                                                      \
      CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE);        \
      break;                                                      \
+    case 32:                                                      \
+      CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE);        \
+      break;                                                      \
    default:                                                      \
      TORCH_CHECK(false, "Unsupported block size: ", block_size); \
      break;                                                      \
  }
-// // NOTE(woosuk): To reduce the compilation time, we omitted block sizes
-// // 1, 2, 4, 64, 128, 256.
-// #define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE)         \
-//   switch (block_size) {                                           \
-//     case 16:                                                      \
-//       CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE);        \
-//       break;                                                      \
-//     default:                                                      \
-//       TORCH_CHECK(false, "Unsupported block size: ", block_size); \
-//       break;                                                      \
-//   }
 void paged_attention_v1(
    torch::Tensor& out,    // [num_seqs, num_heads, head_size]
    torch::Tensor& query,  // [num_seqs, num_heads, head_size]
@@ -1037,25 +1032,20 @@ void paged_attention_v2_launcher(
 // 1, 2, 4, 64, 128, 256.
 #define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE)         \
  switch (block_size) {                                           \
+    case 8:                                                       \
+      CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE);         \
+      break;                                                      \
    case 16:                                                      \
      CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE);        \
      break;                                                      \
+    case 32:                                                      \
+      CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE);        \
+      break;                                                      \
    default:                                                      \
      TORCH_CHECK(false, "Unsupported block size: ", block_size); \
      break;                                                      \
  }
-// // NOTE(woosuk): To reduce the compilation time, we omitted block sizes
-// // 1, 2, 4, 64, 128, 256.
-// #define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE)         \
-//   switch (block_size) {                                           \
-//     case 16:                                                      \
-//       CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE);        \
-//       break;                                                      \
-//     default:                                                      \
-//       TORCH_CHECK(false, "Unsupported block size: ", block_size); \
-//       break;                                                      \
-//   }
 void paged_attention_v2(
    torch::Tensor& out,         // [num_seqs, num_heads, head_size]
    torch::Tensor& exp_sums,    // [num_seqs, num_heads, max_num_partitions]