[MISC] Remove unused variables in C++ (#19609)

Signed-off-by: Lu Fang <lufang@fb.com>

[MISC] Remove unused variables in C++ (#19609)
Signed-off-by: Lu Fang <lufang@fb.com>
c6703d1e · Lu Fang · GitHub · a5e7242d · c6703d1e · c6703d1e
Unverified Commit c6703d1e authored Jun 16, 2025 by Lu Fang Committed by GitHub Jun 15, 2025
6 changed files
--- a/csrc/attention/paged_attention_v1.cu
+++ b/csrc/attention/paged_attention_v1.cu
@@ -65,9 +65,6 @@ void paged_attention_v1_launcher(
  int kv_block_stride = key_cache.stride(0);
  int kv_head_stride = key_cache.stride(1);

-  [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
-  assert(head_size % thread_group_size == 0);
-
  // NOTE: alibi_slopes is optional.
  const float* alibi_slopes_ptr =
      alibi_slopes
@@ -193,4 +190,4 @@ void paged_attention_v1(
 #undef WARP_SIZE
 #undef MAX
 #undef MIN
-#undef DIVIDE_ROUND_UP
\ No newline at end of file
+#undef DIVIDE_ROUND_UP
--- a/csrc/attention/paged_attention_v2.cu
+++ b/csrc/attention/paged_attention_v2.cu
@@ -66,9 +66,6 @@ void paged_attention_v2_launcher(
  int kv_block_stride = key_cache.stride(0);
  int kv_head_stride = key_cache.stride(1);

-  [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
-  assert(head_size % thread_group_size == 0);
-
  // NOTE: alibi_slopes is optional.
  const float* alibi_slopes_ptr =
      alibi_slopes
@@ -203,4 +200,4 @@ void paged_attention_v2(
 #undef WARP_SIZE
 #undef MAX
 #undef MIN
-#undef DIVIDE_ROUND_UP
\ No newline at end of file
+#undef DIVIDE_ROUND_UP
--- a/csrc/prepare_inputs/advance_step.cu
+++ b/csrc/prepare_inputs/advance_step.cu
@@ -274,7 +274,6 @@ void advance_step_flashinfer(
  cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
  cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev);

-  [[maybe_unused]] int block_tables_stride = block_tables.stride(0);
  TORCH_CHECK((blocks * threads > num_queries),
              "multi-step: not enough threads to map to num_queries = ",
              num_queries, " block_tables.stride(0) = ", block_tables.stride(0),

--- a/csrc/quantization/fp8/amd/quant_utils.cuh
+++ b/csrc/quantization/fp8/amd/quant_utils.cuh
@@ -446,8 +446,6 @@ scaled_vec_conversion<uint16_t, uint8_t>(const uint8_t& a, float scale) {
 template <>
 __inline__ __device__ uint32_t
 scaled_vec_conversion<uint32_t, uint16_t>(const uint16_t& a, float scale) {
-  [[maybe_unused]] __half2_raw h2r =
-      __hip_cvt_fp8x2_to_halfraw2(a, fp8_type::__default_interpret);
  union {
    __half2_raw h2r;
    uint32_t ui32;

--- a/csrc/quantization/gptq/q_gemm.cu
+++ b/csrc/quantization/gptq/q_gemm.cu
@@ -206,8 +206,6 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
  auto offset_m = blockIdx.y * m_count;
  auto offset_k = blockIdx.z * BLOCK_KN_SIZE;

-  [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
-  [[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
  int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);

  int n = offset_n + t * 4;
@@ -344,8 +342,6 @@ __global__ void gemm_half_q_half_gptq_2bit_kernel(
  auto offset_m = blockIdx.y * m_count;
  auto offset_k = blockIdx.z * BLOCK_KN_SIZE;

-  [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
-  [[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
  int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);

  int n = offset_n + t * 4;
@@ -465,8 +461,6 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel(
  auto offset_m = blockIdx.y * m_count;
  auto offset_k = blockIdx.z * BLOCK_KN_SIZE;

-  [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
-  [[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
  int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);

  int n = offset_n + t * 4;
@@ -593,8 +587,6 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel(
  auto offset_m = blockIdx.y * m_count;
  auto offset_k = blockIdx.z * BLOCK_KN_SIZE;

-  [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
-  [[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
  int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);

  int n = offset_n + t * 4;

--- a/csrc/rocm/attention.cu
+++ b/csrc/rocm/attention.cu
@@ -136,11 +136,6 @@ __device__ __forceinline__ T from_float(const float& inp) {

 template <typename T>
 __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) {
-  [[maybe_unused]] union tmpcvt {
-    uint16_t u;
-    _Float16 f;
-    __hip_bfloat16 b;
-  } t16;
  _B16x4 ret;
  if constexpr (std::is_same<T, _Float16>::value) {
    union h2cvt {
@@ -169,11 +164,6 @@ __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) {
 template <typename T>
 __device__ __forceinline__ _B16x4 addx4(const _B16x4& inp1,
                                        const _B16x4& inp2) {
-  [[maybe_unused]] union tmpcvt {
-    uint16_t u;
-    _Float16 f;
-    __hip_bfloat16 b;
-  } t1, t2, res;
  _B16x4 ret;
  if constexpr (std::is_same<T, _Float16>::value) {
    union h2cvt {
@@ -325,8 +315,6 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(

  constexpr int GQA_RATIO4 = DIVIDE_ROUND_UP(GQA_RATIO, 4);

-  [[maybe_unused]] __shared__ float shared_qk_max[NWARPS][16 + 1];
-  [[maybe_unused]] __shared__ float shared_exp_sum[NWARPS][16 + 1];
  // shared_logits is used for multiple purposes
  __shared__ _B16x4 shared_logits[NWARPS][4][16][4];

@@ -444,8 +432,6 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
    const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride;
    const int klocal_token_idx =
        TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id;
-    [[maybe_unused]] const int kglobal_token_idx =
-        partition_start_token_idx + klocal_token_idx;
    const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE;
    const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX;

@@ -1309,9 +1295,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(

  const int context_len = context_lens[seq_idx];
  const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
-  [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
  const auto warpid = threadIdx.x / WARP_SIZE;
-  [[maybe_unused]] const auto laneid = threadIdx.x % WARP_SIZE;

  __shared__ float shared_global_exp_sum;
  // max num partitions supported is warp_size * NPAR_LOOPS
@@ -2080,9 +2064,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(

  const int context_len = context_lens[seq_idx];
  const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
-  [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
  const int warpid = threadIdx.x / WARP_SIZE;
-  [[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE;

  __shared__ float shared_global_exp_sum;
  // max num partitions supported is warp_size * NPAR_LOOPS
@@ -2816,9 +2798,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(

  const int context_len = context_lens[seq_idx];
  const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
-  [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
  const int warpid = threadIdx.x / WARP_SIZE;
-  [[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE;

  __shared__ float shared_global_exp_sum;
  // max num partitions supported is warp_size * NPAR_LOOPS