Commit 1b5a38de authored by yaoht's avatar yaoht
Browse files

移除无用的调试代码

parent 5de45ee6
......@@ -194,149 +194,3 @@ __device__ void add_rmsnormBlock_dim8192_bs1024(
}
#endif
\ No newline at end of file
//////////////////////////////////////////////////////////////////////
// #ifndef __ADD_RMS_NORM_CUDA_KERNEL_H__
// #define __ADD_RMS_NORM_CUDA_KERNEL_H__
// // 移除 cub 头文件依赖
// // #include <cub/block/block_reduce.cuh>
// template <unsigned int BLOCK_SIZE, typename Tcompute, typename Tdata, typename Tweight>
// __device__ void add_rmsnormBlock(
// Tdata * y, // 【修复 1】移除 __restrict__ 以支持 In-place
// Tdata * residual_out, // 【修复 1】移除 __restrict__ 以支持 In-place
// ptrdiff_t stride_y_batch,
// ptrdiff_t stride_y_nhead,
// ptrdiff_t stride_residual_out_batch,
// ptrdiff_t stride_residual_out_nhead,
// const Tdata * a, // 【修复 1】移除 __restrict__ 以支持 In-place
// ptrdiff_t stride_a_batch,
// ptrdiff_t stride_a_nhead,
// const Tdata * b, // 【修复 1】移除 __restrict__ 以支持 In-place
// ptrdiff_t stride_b_batch,
// ptrdiff_t stride_b_nhead,
// const Tweight *__restrict__ w, // 权重不被修改,保留 __restrict__ 是安全的
// size_t nhead,
// size_t dim,
// float epsilon) {
// size_t batch_idx = blockIdx.x / nhead;
// size_t head_idx = blockIdx.x % nhead;
// auto y_ptr = y + batch_idx * stride_y_batch + head_idx * stride_y_nhead;
// auto a_ptr = a + batch_idx * stride_a_batch + head_idx * stride_a_nhead;
// auto b_ptr = b + batch_idx * stride_b_batch + head_idx * stride_b_nhead;
// auto w_ptr = w;
// Tdata *residual_out_ptr = residual_out + batch_idx * stride_residual_out_batch + head_idx * stride_residual_out_nhead;
// Tcompute sum_squared = 0;
// for (size_t i = threadIdx.x; i < dim; i += BLOCK_SIZE) {
// Tcompute sum_val = Tcompute(a_ptr[i]) + Tcompute(b_ptr[i]);
// residual_out_ptr[i] = Tdata(sum_val); // Store add result
// sum_squared += sum_val * sum_val;
// }
// // 【修复 2】使用通用且安全的 Shared Memory 手动规约替换 cub::BlockReduce
// // 这样不会受制于特定设备的 Warp Size 差异导致死锁
// __shared__ Tcompute shared_sum[BLOCK_SIZE];
// shared_sum[threadIdx.x] = sum_squared;
// __syncthreads();
// #pragma unroll
// for (unsigned int offset = BLOCK_SIZE / 2; offset > 0; offset /= 2) {
// if (threadIdx.x < offset) {
// shared_sum[threadIdx.x] += shared_sum[threadIdx.x + offset];
// }
// __syncthreads();
// }
// sum_squared = shared_sum[0];
// __shared__ Tcompute rms;
// if (threadIdx.x == 0) {
// rms = Tcompute(rsqrtf(sum_squared / Tcompute(dim) + epsilon));
// }
// __syncthreads();
// // 重新利用算出的 residual_out
// for (size_t i = threadIdx.x; i < dim; i += BLOCK_SIZE) {
// Tcompute sum_val = Tcompute(residual_out_ptr[i]);
// y_ptr[i] = Tdata(sum_val * Tcompute(w_ptr[i]) * rms);
// }
// }
// #endif
////////////////////////////////////////////////////////////////////////////
// #ifndef __ADD_RMS_NORM_CUDA_KERNEL_H__
// #define __ADD_RMS_NORM_CUDA_KERNEL_H__
// #include <cub/block/block_reduce.cuh>
// // 假设每个线程最多处理的元素个数。
// // 例如 70B dim=8192, BLOCK_SIZE=1024,只需 8 个。设为 16 绝对够用。
// #define MAX_ELEMS_PER_THREAD 16
// template <unsigned int BLOCK_SIZE, typename Tcompute, typename Tdata, typename Tweight>
// __device__ void add_rmsnormBlock(
// Tdata *__restrict__ y,
// Tdata *__restrict__ residual_out,
// ptrdiff_t stride_y_batch,
// ptrdiff_t stride_y_seq, // 🌟 修正命名:通常是按 seq_len 划分,而不是 nhead
// ptrdiff_t stride_residual_out_batch,
// ptrdiff_t stride_residual_out_seq,
// const Tdata *__restrict__ a,
// ptrdiff_t stride_a_batch,
// ptrdiff_t stride_a_seq,
// const Tdata *__restrict__ b,
// ptrdiff_t stride_b_batch,
// ptrdiff_t stride_b_seq,
// const Tweight *__restrict__ w,
// size_t seq_len, // 🌟 修正命名:取代 nhead
// size_t dim,
// float epsilon) {
// // 🌟 一个 Block 处理一个 Token
// size_t batch_idx = blockIdx.x / seq_len;
// size_t seq_idx = blockIdx.x % seq_len;
// auto y_ptr = y + batch_idx * stride_y_batch + seq_idx * stride_y_seq;
// auto a_ptr = a + batch_idx * stride_a_batch + seq_idx * stride_a_seq;
// auto b_ptr = b + batch_idx * stride_b_batch + seq_idx * stride_b_seq;
// Tdata *residual_out_ptr = residual_out + batch_idx * stride_residual_out_batch + seq_idx * stride_residual_out_seq;
// Tcompute sum_squared = 0;
// // 🌟 真融合核心:用寄存器数组缓存当前线程计算的加法结果!
// Tcompute thread_cache[MAX_ELEMS_PER_THREAD];
// int cache_idx = 0;
// for (size_t i = threadIdx.x; i < dim; i += BLOCK_SIZE) {
// Tcompute sum_val = Tcompute(a_ptr[i]) + Tcompute(b_ptr[i]);
// residual_out_ptr[i] = Tdata(sum_val); // 依然写回全局显存供后续 Attention 使用
// thread_cache[cache_idx++] = sum_val; // 🌟 同时保存在极速寄存器中!
// sum_squared += sum_val * sum_val;
// }
// // Block 内规约求平方和
// using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
// __shared__ typename BlockReduce::TempStorage temp_storage;
// sum_squared = BlockReduce(temp_storage).Sum(sum_squared);
// __shared__ Tcompute rms;
// if (threadIdx.x == 0) {
// rms = Tcompute(rsqrtf(sum_squared / Tcompute(dim) + epsilon));
// }
// __syncthreads();
// // 🌟 第二阶段:直接从寄存器 `thread_cache` 读取,彻底干掉那次致命的显存读取!
// cache_idx = 0;
// for (size_t i = threadIdx.x; i < dim; i += BLOCK_SIZE) {
// // 使用 __ldg (如果框架支持) 读取公共权重,速度拉满
// Tcompute weight_val = Tcompute(__ldg(&w[i]));
// y_ptr[i] = Tdata(thread_cache[cache_idx++] * weight_val * rms);
// }
// }
// #endif
\ No newline at end of file
......@@ -114,7 +114,7 @@ INFINIOP_CUDA_KERNEL add_rmsnormKernel_dim4096_bs1024_bf162_vec(
w, nhead, epsilon);
}
// DIM=8192, block=1024: 4x nv_bfloat162 per thread; pair idx = tid + i*1024 (same as scalar tiling; avoids longlong2 reorder issues).
// DIM=8192, block=1024: 4x nv_bfloat162 per thread; pair idx = tid + i*1024 .
__device__ void add_rmsnormBlock_dim8192_bs1024_bf162_vec(
__nv_bfloat16 *__restrict__ y,
__nv_bfloat16 *__restrict__ residual_out,
......
......@@ -94,21 +94,6 @@ infiniStatus_t launchKernel(const PagedCachingInfo &info,
k_cache_slot_stride,
v_cache_slot_stride);
} else if (dtype == INFINI_DTYPE_BF16) {
std::cout<< "NUM_THREADS: " << NUM_THREADS << std::endl;
std::cout<< "grid: " << grid.x << ", " << grid.y << ", " << grid.z << std::endl;
std::cout<< "block: " << block.x << ", " << block.y << ", " << block.z << std::endl;
std::cout<< "shared_mem_size: " << shared_mem_size << std::endl;
std::cout<< "slot_mapping: " << slot_mapping << std::endl;
std::cout<< "head_size: " << head_size << std::endl;
std::cout<< "block_size: " << block_size << std::endl;
std::cout<< "k_src_stride: " << k_src_stride << std::endl;
std::cout<< "v_src_stride: " << v_src_stride << std::endl;
std::cout<< "k_cache_block_stride: " << k_cache_block_stride << std::endl;
std::cout<< "v_cache_block_stride: " << v_cache_block_stride << std::endl;
std::cout<< "k_cache_head_stride: " << k_cache_head_stride << std::endl;
std::cout<< "v_cache_head_stride: " << v_cache_head_stride << std::endl;
std::cout<< "k_cache_slot_stride: " << k_cache_slot_stride << std::endl;
std::cout<< "v_cache_slot_stride: " << v_cache_slot_stride << std::endl;
pagedCaching<__nv_bfloat16, NUM_THREADS>
<<<grid, block, shared_mem_size, stream>>>(
(__nv_bfloat16 *)k_cache,
......
Markdown is supported
Attach a file by drag & drop or click to upload (0%).
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment