Commit 1b5a38de authored by yaoht's avatar yaoht
Browse files

移除无用的调试代码

parent 5de45ee6
......@@ -194,149 +194,3 @@ __device__ void add_rmsnormBlock_dim8192_bs1024(
}
#endif
\ No newline at end of file
//////////////////////////////////////////////////////////////////////
// #ifndef __ADD_RMS_NORM_CUDA_KERNEL_H__
// #define __ADD_RMS_NORM_CUDA_KERNEL_H__
// // 移除 cub 头文件依赖
// // #include <cub/block/block_reduce.cuh>
// template <unsigned int BLOCK_SIZE, typename Tcompute, typename Tdata, typename Tweight>
// __device__ void add_rmsnormBlock(
// Tdata * y, // 【修复 1】移除 __restrict__ 以支持 In-place
// Tdata * residual_out, // 【修复 1】移除 __restrict__ 以支持 In-place
// ptrdiff_t stride_y_batch,
// ptrdiff_t stride_y_nhead,
// ptrdiff_t stride_residual_out_batch,
// ptrdiff_t stride_residual_out_nhead,
// const Tdata * a, // 【修复 1】移除 __restrict__ 以支持 In-place
// ptrdiff_t stride_a_batch,
// ptrdiff_t stride_a_nhead,
// const Tdata * b, // 【修复 1】移除 __restrict__ 以支持 In-place
// ptrdiff_t stride_b_batch,
// ptrdiff_t stride_b_nhead,
// const Tweight *__restrict__ w, // 权重不被修改,保留 __restrict__ 是安全的
// size_t nhead,
// size_t dim,
// float epsilon) {
// size_t batch_idx = blockIdx.x / nhead;
// size_t head_idx = blockIdx.x % nhead;
// auto y_ptr = y + batch_idx * stride_y_batch + head_idx * stride_y_nhead;
// auto a_ptr = a + batch_idx * stride_a_batch + head_idx * stride_a_nhead;
// auto b_ptr = b + batch_idx * stride_b_batch + head_idx * stride_b_nhead;
// auto w_ptr = w;
// Tdata *residual_out_ptr = residual_out + batch_idx * stride_residual_out_batch + head_idx * stride_residual_out_nhead;
// Tcompute sum_squared = 0;
// for (size_t i = threadIdx.x; i < dim; i += BLOCK_SIZE) {
// Tcompute sum_val = Tcompute(a_ptr[i]) + Tcompute(b_ptr[i]);
// residual_out_ptr[i] = Tdata(sum_val); // Store add result
// sum_squared += sum_val * sum_val;
// }
// // 【修复 2】使用通用且安全的 Shared Memory 手动规约替换 cub::BlockReduce
// // 这样不会受制于特定设备的 Warp Size 差异导致死锁
// __shared__ Tcompute shared_sum[BLOCK_SIZE];
// shared_sum[threadIdx.x] = sum_squared;
// __syncthreads();
// #pragma unroll
// for (unsigned int offset = BLOCK_SIZE / 2; offset > 0; offset /= 2) {
// if (threadIdx.x < offset) {
// shared_sum[threadIdx.x] += shared_sum[threadIdx.x + offset];
// }
// __syncthreads();
// }
// sum_squared = shared_sum[0];
// __shared__ Tcompute rms;
// if (threadIdx.x == 0) {
// rms = Tcompute(rsqrtf(sum_squared / Tcompute(dim) + epsilon));
// }
// __syncthreads();
// // 重新利用算出的 residual_out
// for (size_t i = threadIdx.x; i < dim; i += BLOCK_SIZE) {
// Tcompute sum_val = Tcompute(residual_out_ptr[i]);
// y_ptr[i] = Tdata(sum_val * Tcompute(w_ptr[i]) * rms);
// }
// }
// #endif
////////////////////////////////////////////////////////////////////////////
// #ifndef __ADD_RMS_NORM_CUDA_KERNEL_H__
// #define __ADD_RMS_NORM_CUDA_KERNEL_H__
// #include <cub/block/block_reduce.cuh>
// // 假设每个线程最多处理的元素个数。
// // 例如 70B dim=8192, BLOCK_SIZE=1024,只需 8 个。设为 16 绝对够用。
// #define MAX_ELEMS_PER_THREAD 16
// template <unsigned int BLOCK_SIZE, typename Tcompute, typename Tdata, typename Tweight>
// __device__ void add_rmsnormBlock(
// Tdata *__restrict__ y,
// Tdata *__restrict__ residual_out,
// ptrdiff_t stride_y_batch,
// ptrdiff_t stride_y_seq, // 🌟 修正命名:通常是按 seq_len 划分,而不是 nhead
// ptrdiff_t stride_residual_out_batch,
// ptrdiff_t stride_residual_out_seq,
// const Tdata *__restrict__ a,
// ptrdiff_t stride_a_batch,
// ptrdiff_t stride_a_seq,
// const Tdata *__restrict__ b,
// ptrdiff_t stride_b_batch,
// ptrdiff_t stride_b_seq,
// const Tweight *__restrict__ w,
// size_t seq_len, // 🌟 修正命名:取代 nhead
// size_t dim,
// float epsilon) {
// // 🌟 一个 Block 处理一个 Token
// size_t batch_idx = blockIdx.x / seq_len;
// size_t seq_idx = blockIdx.x % seq_len;
// auto y_ptr = y + batch_idx * stride_y_batch + seq_idx * stride_y_seq;
// auto a_ptr = a + batch_idx * stride_a_batch + seq_idx * stride_a_seq;
// auto b_ptr = b + batch_idx * stride_b_batch + seq_idx * stride_b_seq;
// Tdata *residual_out_ptr = residual_out + batch_idx * stride_residual_out_batch + seq_idx * stride_residual_out_seq;
// Tcompute sum_squared = 0;
// // 🌟 真融合核心:用寄存器数组缓存当前线程计算的加法结果!
// Tcompute thread_cache[MAX_ELEMS_PER_THREAD];
// int cache_idx = 0;
// for (size_t i = threadIdx.x; i < dim; i += BLOCK_SIZE) {
// Tcompute sum_val = Tcompute(a_ptr[i]) + Tcompute(b_ptr[i]);
// residual_out_ptr[i] = Tdata(sum_val); // 依然写回全局显存供后续 Attention 使用
// thread_cache[cache_idx++] = sum_val; // 🌟 同时保存在极速寄存器中!
// sum_squared += sum_val * sum_val;
// }
// // Block 内规约求平方和
// using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
// __shared__ typename BlockReduce::TempStorage temp_storage;
// sum_squared = BlockReduce(temp_storage).Sum(sum_squared);
// __shared__ Tcompute rms;
// if (threadIdx.x == 0) {
// rms = Tcompute(rsqrtf(sum_squared / Tcompute(dim) + epsilon));
// }
// __syncthreads();
// // 🌟 第二阶段:直接从寄存器 `thread_cache` 读取,彻底干掉那次致命的显存读取!
// cache_idx = 0;
// for (size_t i = threadIdx.x; i < dim; i += BLOCK_SIZE) {
// // 使用 __ldg (如果框架支持) 读取公共权重,速度拉满
// Tcompute weight_val = Tcompute(__ldg(&w[i]));
// y_ptr[i] = Tdata(thread_cache[cache_idx++] * weight_val * rms);
// }
// }
// #endif
\ No newline at end of file
......@@ -114,7 +114,7 @@ INFINIOP_CUDA_KERNEL add_rmsnormKernel_dim4096_bs1024_bf162_vec(
w, nhead, epsilon);
}
// DIM=8192, block=1024: 4x nv_bfloat162 per thread; pair idx = tid + i*1024 (same as scalar tiling; avoids longlong2 reorder issues).
// DIM=8192, block=1024: 4x nv_bfloat162 per thread; pair idx = tid + i*1024 .
__device__ void add_rmsnormBlock_dim8192_bs1024_bf162_vec(
__nv_bfloat16 *__restrict__ y,
__nv_bfloat16 *__restrict__ residual_out,
......
......@@ -94,21 +94,6 @@ infiniStatus_t launchKernel(const PagedCachingInfo &info,
k_cache_slot_stride,
v_cache_slot_stride);
} else if (dtype == INFINI_DTYPE_BF16) {
std::cout<< "NUM_THREADS: " << NUM_THREADS << std::endl;
std::cout<< "grid: " << grid.x << ", " << grid.y << ", " << grid.z << std::endl;
std::cout<< "block: " << block.x << ", " << block.y << ", " << block.z << std::endl;
std::cout<< "shared_mem_size: " << shared_mem_size << std::endl;
std::cout<< "slot_mapping: " << slot_mapping << std::endl;
std::cout<< "head_size: " << head_size << std::endl;
std::cout<< "block_size: " << block_size << std::endl;
std::cout<< "k_src_stride: " << k_src_stride << std::endl;
std::cout<< "v_src_stride: " << v_src_stride << std::endl;
std::cout<< "k_cache_block_stride: " << k_cache_block_stride << std::endl;
std::cout<< "v_cache_block_stride: " << v_cache_block_stride << std::endl;
std::cout<< "k_cache_head_stride: " << k_cache_head_stride << std::endl;
std::cout<< "v_cache_head_stride: " << v_cache_head_stride << std::endl;
std::cout<< "k_cache_slot_stride: " << k_cache_slot_stride << std::endl;
std::cout<< "v_cache_slot_stride: " << v_cache_slot_stride << std::endl;
pagedCaching<__nv_bfloat16, NUM_THREADS>
<<<grid, block, shared_mem_size, stream>>>(
(__nv_bfloat16 *)k_cache,
......
Markdown is supported
Attach a file by drag & drop or click to upload (0%).
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment