Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
1b5a38de
Commit
1b5a38de
authored
Apr 22, 2026
by
yaoht
Browse files
移除无用的调试代码
parent
5de45ee6
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
2 additions
and
163 deletions
+2
-163
src/infiniop/ops/add_rms_norm/cuda/kernel.cuh
src/infiniop/ops/add_rms_norm/cuda/kernel.cuh
+1
-147
src/infiniop/ops/add_rms_norm/nvidia/add_rms_norm_nvidia.cu
src/infiniop/ops/add_rms_norm/nvidia/add_rms_norm_nvidia.cu
+1
-1
src/infiniop/ops/paged_caching/nvidia/paged_caching_nvidia.cu
...infiniop/ops/paged_caching/nvidia/paged_caching_nvidia.cu
+0
-15
No files found.
src/infiniop/ops/add_rms_norm/cuda/kernel.cuh
View file @
1b5a38de
...
...
@@ -194,149 +194,3 @@ __device__ void add_rmsnormBlock_dim8192_bs1024(
}
#endif
\ No newline at end of file
//////////////////////////////////////////////////////////////////////
// #ifndef __ADD_RMS_NORM_CUDA_KERNEL_H__
// #define __ADD_RMS_NORM_CUDA_KERNEL_H__
// // 移除 cub 头文件依赖
// // #include <cub/block/block_reduce.cuh>
// template <unsigned int BLOCK_SIZE, typename Tcompute, typename Tdata, typename Tweight>
// __device__ void add_rmsnormBlock(
// Tdata * y, // 【修复 1】移除 __restrict__ 以支持 In-place
// Tdata * residual_out, // 【修复 1】移除 __restrict__ 以支持 In-place
// ptrdiff_t stride_y_batch,
// ptrdiff_t stride_y_nhead,
// ptrdiff_t stride_residual_out_batch,
// ptrdiff_t stride_residual_out_nhead,
// const Tdata * a, // 【修复 1】移除 __restrict__ 以支持 In-place
// ptrdiff_t stride_a_batch,
// ptrdiff_t stride_a_nhead,
// const Tdata * b, // 【修复 1】移除 __restrict__ 以支持 In-place
// ptrdiff_t stride_b_batch,
// ptrdiff_t stride_b_nhead,
// const Tweight *__restrict__ w, // 权重不被修改,保留 __restrict__ 是安全的
// size_t nhead,
// size_t dim,
// float epsilon) {
// size_t batch_idx = blockIdx.x / nhead;
// size_t head_idx = blockIdx.x % nhead;
// auto y_ptr = y + batch_idx * stride_y_batch + head_idx * stride_y_nhead;
// auto a_ptr = a + batch_idx * stride_a_batch + head_idx * stride_a_nhead;
// auto b_ptr = b + batch_idx * stride_b_batch + head_idx * stride_b_nhead;
// auto w_ptr = w;
// Tdata *residual_out_ptr = residual_out + batch_idx * stride_residual_out_batch + head_idx * stride_residual_out_nhead;
// Tcompute sum_squared = 0;
// for (size_t i = threadIdx.x; i < dim; i += BLOCK_SIZE) {
// Tcompute sum_val = Tcompute(a_ptr[i]) + Tcompute(b_ptr[i]);
// residual_out_ptr[i] = Tdata(sum_val); // Store add result
// sum_squared += sum_val * sum_val;
// }
// // 【修复 2】使用通用且安全的 Shared Memory 手动规约替换 cub::BlockReduce
// // 这样不会受制于特定设备的 Warp Size 差异导致死锁
// __shared__ Tcompute shared_sum[BLOCK_SIZE];
// shared_sum[threadIdx.x] = sum_squared;
// __syncthreads();
// #pragma unroll
// for (unsigned int offset = BLOCK_SIZE / 2; offset > 0; offset /= 2) {
// if (threadIdx.x < offset) {
// shared_sum[threadIdx.x] += shared_sum[threadIdx.x + offset];
// }
// __syncthreads();
// }
// sum_squared = shared_sum[0];
// __shared__ Tcompute rms;
// if (threadIdx.x == 0) {
// rms = Tcompute(rsqrtf(sum_squared / Tcompute(dim) + epsilon));
// }
// __syncthreads();
// // 重新利用算出的 residual_out
// for (size_t i = threadIdx.x; i < dim; i += BLOCK_SIZE) {
// Tcompute sum_val = Tcompute(residual_out_ptr[i]);
// y_ptr[i] = Tdata(sum_val * Tcompute(w_ptr[i]) * rms);
// }
// }
// #endif
////////////////////////////////////////////////////////////////////////////
// #ifndef __ADD_RMS_NORM_CUDA_KERNEL_H__
// #define __ADD_RMS_NORM_CUDA_KERNEL_H__
// #include <cub/block/block_reduce.cuh>
// // 假设每个线程最多处理的元素个数。
// // 例如 70B dim=8192, BLOCK_SIZE=1024,只需 8 个。设为 16 绝对够用。
// #define MAX_ELEMS_PER_THREAD 16
// template <unsigned int BLOCK_SIZE, typename Tcompute, typename Tdata, typename Tweight>
// __device__ void add_rmsnormBlock(
// Tdata *__restrict__ y,
// Tdata *__restrict__ residual_out,
// ptrdiff_t stride_y_batch,
// ptrdiff_t stride_y_seq, // 🌟 修正命名:通常是按 seq_len 划分,而不是 nhead
// ptrdiff_t stride_residual_out_batch,
// ptrdiff_t stride_residual_out_seq,
// const Tdata *__restrict__ a,
// ptrdiff_t stride_a_batch,
// ptrdiff_t stride_a_seq,
// const Tdata *__restrict__ b,
// ptrdiff_t stride_b_batch,
// ptrdiff_t stride_b_seq,
// const Tweight *__restrict__ w,
// size_t seq_len, // 🌟 修正命名:取代 nhead
// size_t dim,
// float epsilon) {
// // 🌟 一个 Block 处理一个 Token
// size_t batch_idx = blockIdx.x / seq_len;
// size_t seq_idx = blockIdx.x % seq_len;
// auto y_ptr = y + batch_idx * stride_y_batch + seq_idx * stride_y_seq;
// auto a_ptr = a + batch_idx * stride_a_batch + seq_idx * stride_a_seq;
// auto b_ptr = b + batch_idx * stride_b_batch + seq_idx * stride_b_seq;
// Tdata *residual_out_ptr = residual_out + batch_idx * stride_residual_out_batch + seq_idx * stride_residual_out_seq;
// Tcompute sum_squared = 0;
// // 🌟 真融合核心:用寄存器数组缓存当前线程计算的加法结果!
// Tcompute thread_cache[MAX_ELEMS_PER_THREAD];
// int cache_idx = 0;
// for (size_t i = threadIdx.x; i < dim; i += BLOCK_SIZE) {
// Tcompute sum_val = Tcompute(a_ptr[i]) + Tcompute(b_ptr[i]);
// residual_out_ptr[i] = Tdata(sum_val); // 依然写回全局显存供后续 Attention 使用
// thread_cache[cache_idx++] = sum_val; // 🌟 同时保存在极速寄存器中!
// sum_squared += sum_val * sum_val;
// }
// // Block 内规约求平方和
// using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
// __shared__ typename BlockReduce::TempStorage temp_storage;
// sum_squared = BlockReduce(temp_storage).Sum(sum_squared);
// __shared__ Tcompute rms;
// if (threadIdx.x == 0) {
// rms = Tcompute(rsqrtf(sum_squared / Tcompute(dim) + epsilon));
// }
// __syncthreads();
// // 🌟 第二阶段:直接从寄存器 `thread_cache` 读取,彻底干掉那次致命的显存读取!
// cache_idx = 0;
// for (size_t i = threadIdx.x; i < dim; i += BLOCK_SIZE) {
// // 使用 __ldg (如果框架支持) 读取公共权重,速度拉满
// Tcompute weight_val = Tcompute(__ldg(&w[i]));
// y_ptr[i] = Tdata(thread_cache[cache_idx++] * weight_val * rms);
// }
// }
// #endif
\ No newline at end of file
src/infiniop/ops/add_rms_norm/nvidia/add_rms_norm_nvidia.cu
View file @
1b5a38de
...
...
@@ -114,7 +114,7 @@ INFINIOP_CUDA_KERNEL add_rmsnormKernel_dim4096_bs1024_bf162_vec(
w
,
nhead
,
epsilon
);
}
// DIM=8192, block=1024: 4x nv_bfloat162 per thread; pair idx = tid + i*1024
(same as scalar tiling; avoids longlong2 reorder issues)
.
// DIM=8192, block=1024: 4x nv_bfloat162 per thread; pair idx = tid + i*1024 .
__device__
void
add_rmsnormBlock_dim8192_bs1024_bf162_vec
(
__nv_bfloat16
*
__restrict__
y
,
__nv_bfloat16
*
__restrict__
residual_out
,
...
...
src/infiniop/ops/paged_caching/nvidia/paged_caching_nvidia.cu
View file @
1b5a38de
...
...
@@ -94,21 +94,6 @@ infiniStatus_t launchKernel(const PagedCachingInfo &info,
k_cache_slot_stride
,
v_cache_slot_stride
);
}
else
if
(
dtype
==
INFINI_DTYPE_BF16
)
{
std
::
cout
<<
"NUM_THREADS: "
<<
NUM_THREADS
<<
std
::
endl
;
std
::
cout
<<
"grid: "
<<
grid
.
x
<<
", "
<<
grid
.
y
<<
", "
<<
grid
.
z
<<
std
::
endl
;
std
::
cout
<<
"block: "
<<
block
.
x
<<
", "
<<
block
.
y
<<
", "
<<
block
.
z
<<
std
::
endl
;
std
::
cout
<<
"shared_mem_size: "
<<
shared_mem_size
<<
std
::
endl
;
std
::
cout
<<
"slot_mapping: "
<<
slot_mapping
<<
std
::
endl
;
std
::
cout
<<
"head_size: "
<<
head_size
<<
std
::
endl
;
std
::
cout
<<
"block_size: "
<<
block_size
<<
std
::
endl
;
std
::
cout
<<
"k_src_stride: "
<<
k_src_stride
<<
std
::
endl
;
std
::
cout
<<
"v_src_stride: "
<<
v_src_stride
<<
std
::
endl
;
std
::
cout
<<
"k_cache_block_stride: "
<<
k_cache_block_stride
<<
std
::
endl
;
std
::
cout
<<
"v_cache_block_stride: "
<<
v_cache_block_stride
<<
std
::
endl
;
std
::
cout
<<
"k_cache_head_stride: "
<<
k_cache_head_stride
<<
std
::
endl
;
std
::
cout
<<
"v_cache_head_stride: "
<<
v_cache_head_stride
<<
std
::
endl
;
std
::
cout
<<
"k_cache_slot_stride: "
<<
k_cache_slot_stride
<<
std
::
endl
;
std
::
cout
<<
"v_cache_slot_stride: "
<<
v_cache_slot_stride
<<
std
::
endl
;
pagedCaching
<
__nv_bfloat16
,
NUM_THREADS
>
<<<
grid
,
block
,
shared_mem_size
,
stream
>>>
(
(
__nv_bfloat16
*
)
k_cache
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment