Merge tag 'v0.7.1' into v0.7.1-dev

afd0da21 · zhuwenwen · 1a11f127 · 4f4d427a · afd0da21 · afd0da21
Commit afd0da21 authored Feb 03, 2025 by zhuwenwen
20 changed files
--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
--- a/csrc/attention/attention_kernels.cuh
+++ b/csrc/attention/attention_kernels.cuh
@@ -105,7 +105,7 @@ __device__ void paged_attention_kernel(
    const int max_num_blocks_per_seq,
    const float* __restrict__ alibi_slopes,  // [num_heads]
    const int q_stride, const int kv_block_stride, const int kv_head_stride,
-    const float k_scale, const float v_scale, const int tp_rank,
+    const float* k_scale, const float* v_scale, const int tp_rank,
    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
    const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
  const int seq_idx = blockIdx.y;
@@ -285,7 +285,7 @@ __device__ void paged_attention_kernel(
          Quant_vec k_vec_quant = *reinterpret_cast<const Quant_vec*>(
              k_ptr + offset1 * BLOCK_SIZE * x + offset2);
          k_vecs[j] = fp8::scaled_convert<K_vec, Quant_vec, KV_DTYPE>(
-              k_vec_quant, k_scale);
+              k_vec_quant, *k_scale);
        }
      }
@@ -415,7 +415,7 @@ __device__ void paged_attention_kernel(
              *reinterpret_cast<const V_quant_vec*>(v_ptr + offset);
          // Vector conversion from V_quant_vec to V_vec.
          v_vec = fp8::scaled_convert<V_vec, V_quant_vec, KV_DTYPE>(v_quant_vec,
-                                                                    v_scale);
+                                                                    *v_scale);
        }
        if (block_idx == num_seq_blocks - 1) {
          // NOTE(woosuk): When v_vec contains the tokens that are out of the
@@ -513,7 +513,7 @@ __global__ void paged_attention_v1_kernel(
    const int max_num_blocks_per_seq,
    const float* __restrict__ alibi_slopes,  // [num_heads]
    const int q_stride, const int kv_block_stride, const int kv_head_stride,
-    const float k_scale, const float v_scale, const int tp_rank,
+    const float* k_scale, const float* v_scale, const int tp_rank,
    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
    const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
  paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS,
@@ -549,7 +549,7 @@ __global__ void paged_attention_v2_kernel(
    const int max_num_blocks_per_seq,
    const float* __restrict__ alibi_slopes,  // [num_heads]
    const int q_stride, const int kv_block_stride, const int kv_head_stride,
-    const float k_scale, const float v_scale, const int tp_rank,
+    const float* k_scale, const float* v_scale, const int tp_rank,
    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
    const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
  paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS,

--- a/csrc/attention/paged_attention_v1.cu
+++ b/csrc/attention/paged_attention_v1.cu
--- a/csrc/attention/paged_attention_v2.cu
+++ b/csrc/attention/paged_attention_v2.cu
--- a/csrc/cache.h
+++ b/csrc/cache.h
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
--- a/csrc/core/math.hpp
+++ b/csrc/core/math.hpp
--- a/csrc/core/scalar_type.hpp
+++ b/csrc/core/scalar_type.hpp
--- a/csrc/cpu/attention.cpp
+++ b/csrc/cpu/attention.cpp
--- a/csrc/cpu/cache.cpp
+++ b/csrc/cpu/cache.cpp
--- a/csrc/cpu/cpu_types.hpp
+++ b/csrc/cpu/cpu_types.hpp
--- a/csrc/cpu/cpu_types_arm.hpp
+++ b/csrc/cpu/cpu_types_arm.hpp
--- a/csrc/cpu/cpu_types_vsx.hpp
+++ b/csrc/cpu/cpu_types_vsx.hpp
--- a/csrc/cpu/cpu_types_x86.hpp
+++ b/csrc/cpu/cpu_types_x86.hpp
--- a/csrc/cpu/quant.cpp
+++ b/csrc/cpu/quant.cpp
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
--- a/csrc/cpu/utils.cpp
+++ b/csrc/cpu/utils.cpp
--- a/csrc/cumem_allocator.cpp
+++ b/csrc/cumem_allocator.cpp
--- a/csrc/custom_all_reduce.cuh
+++ b/csrc/custom_all_reduce.cuh
--- a/csrc/cutlass_extensions/common.hpp
+++ b/csrc/cutlass_extensions/common.hpp