Merge branch '0.5.0_blocksize_8_32' into 'v0.5.0-dtk24.04.1'

恢复 blocksize 8 和 32 支持。 See merge request dcutoolkit/deeplearing/vllm!8

Merge branch '0.5.0_blocksize_8_32' into 'v0.5.0-dtk24.04.1'
恢复 blocksize 8 和 32 支持。 See merge request dcutoolkit/deeplearing/vllm!8
9807bb0f · zhuwenwen · 268fc6a2 · f99a8d1c · 9807bb0f
Commit 9807bb0f authored Aug 13, 2024 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing with 12 additions and 20 deletions

csrc/attention/attention_kernels.cu +12 -20

No files found.
--- a/csrc/attention/attention_kernels.cu
+++ b/csrc/attention/attention_kernels.cu
@@ -879,24 +879,20 @@ void paged_attention_v1_launcher(
 // 1, 2, 4, 64, 128, 256.
 #define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE)         \
  switch (block_size) {                                           \
+    case 8:                                                       \
+      CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE);         \
+      break;                                                      \
    case 16:                                                      \
      CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE);        \
      break;                                                      \
+    case 32:                                                      \
+      CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE);        \
+      break;                                                      \
    default:                                                      \
      TORCH_CHECK(false, "Unsupported block size: ", block_size); \
      break;                                                      \
  }

-// // NOTE(woosuk): To reduce the compilation time, we omitted block sizes
-// // 1, 2, 4, 64, 128, 256.
-// #define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE)         \
-//   switch (block_size) {                                           \
-//     case 16:                                                      \
-//       CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE);        \
-//       break;                                                      \
-//       TORCH_CHECK(false, "Unsupported block size: ", block_size); \
-//       break;                                                      \
-//   }

 void paged_attention_v1(
    torch::Tensor& out,    // [num_seqs, num_heads, head_size]
@@ -1030,24 +1026,20 @@ void paged_attention_v2_launcher(
 // 1, 2, 4, 64, 128, 256.
 #define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE)         \
  switch (block_size) {                                           \
+    case 8:                                                       \
+      CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE);         \
+      break;                                                      \
    case 16:                                                      \
      CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE);        \
      break;                                                      \
+    case 32:                                                      \
+      CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE);        \
+      break;                                                      \
    default:                                                      \
      TORCH_CHECK(false, "Unsupported block size: ", block_size); \
      break;                                                      \
  }

-// // NOTE(woosuk): To reduce the compilation time, we omitted block sizes
-// // 1, 2, 4, 64, 128, 256.
-// #define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE)         \
-//   switch (block_size) {                                           \
-//     case 16:                                                      \
-//       CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE);        \
-//       break;                                                      \
-//       TORCH_CHECK(false, "Unsupported block size: ", block_size); \
-//       break;                                                      \
-//   }

 void paged_attention_v2(
    torch::Tensor& out,         // [num_seqs, num_heads, head_size]