#include "hip/hip_runtime.h" #include #include __global__ void _Add(long long sz, float* Z, const float* X, const float* Y) { long long offset = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; if (offset < sz) { Z[offset] = X[offset] + Y[offset]; } return; } void rocm_add(int64_t sz, float* Z, const float* X, const float* Y, hipStream_t compute_stream) { float *d_X, *d_Y, *d_Z; // 为设备上的数组分配内存并检查分配结果 hipError_t err; err = hipMalloc((void**)&d_X, sz * sizeof(float)); if (err != hipSuccess) { fprintf(stderr, "hipMalloc for d_X failed: %s\n", hipGetErrorString(err)); return; } err = hipMalloc((void**)&d_Y, sz * sizeof(float)); if (err != hipSuccess) { fprintf(stderr, "hipMalloc for d_Y failed: %s\n", hipGetErrorString(err)); hipFree(d_X); return; } err = hipMalloc((void**)&d_Z, sz * sizeof(float)); if (err != hipSuccess) { fprintf(stderr, "hipMalloc for d_Z failed: %s\n", hipGetErrorString(err)); hipFree(d_X); hipFree(d_Y); return; } // 将主机上的 X 和 Y 数组数据复制到设备 err = hipMemcpyAsync(d_X, X, sz * sizeof(float), hipMemcpyHostToDevice, compute_stream); if (err != hipSuccess) { fprintf(stderr, "hipMemcpyAsync for d_X failed: %s\n", hipGetErrorString(err)); hipFree(d_X); hipFree(d_Y); hipFree(d_Z); return; } err = hipMemcpyAsync(d_Y, Y, sz * sizeof(float), hipMemcpyHostToDevice, compute_stream); if (err != hipSuccess) { fprintf(stderr, "hipMemcpyAsync for d_Y failed: %s\n", hipGetErrorString(err)); hipFree(d_X); hipFree(d_Y); hipFree(d_Z); return; } // 调用核函数 _Add<<<256, 256, 0, compute_stream>>>(static_cast(sz), d_Z, d_X, d_Y); err = hipGetLastError(); if (err != hipSuccess) { fprintf(stderr, "Kernel launch failed: %s\n", hipGetErrorString(err)); hipFree(d_X); hipFree(d_Y); hipFree(d_Z); return; } // 将计算结果从设备复制回主机 err = hipMemcpyAsync(Z, d_Z, sz * sizeof(float), hipMemcpyDeviceToHost, compute_stream); if (err != hipSuccess) { fprintf(stderr, "hipMemcpyAsync for Z failed: %s\n", hipGetErrorString(err)); hipFree(d_X); hipFree(d_Y); hipFree(d_Z); return; } // 同步流，确保所有操作完成 err = hipStreamSynchronize(compute_stream); if (err != hipSuccess) { fprintf(stderr, "hipStreamSynchronize failed: %s\n", hipGetErrorString(err)); hipFree(d_X); hipFree(d_Y); hipFree(d_Z); return; } // 释放设备上的内存 hipFree(d_X); hipFree(d_Y); hipFree(d_Z); } // Concat __global__ void _Concat2D(int axis, int M1, int N1, const float* X1, int M2, int N2, const float* X2, float* Z) { int row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; if (axis == 0) { // 按行连接 if (row < M1 && col < N1) { Z[row * N1 + col] = X1[row * N1 + col]; } else if (row >= M1 && row < M1 + M2 && col < N2) { Z[row * N2 + col] = X2[(row - M1) * N2 + col]; } } else if (axis == 1) { // 按列连接 if (row < M1 && col < N1) { Z[row * (N1 + N2) + col] = X1[row * N1 + col]; } else if (row < M2 && col >= N1 && col < N1 + N2) { Z[row * (N1 + N2) + col] = X2[row * N2 + (col - N1)]; } } return; } void rocm_concat(int axis, int M1, int N1, const float* X1, int M2, int N2, const float* X2, float* Z, hipStream_t compute_stream) { dim3 blockDim(16, 16); dim3 gridDim((axis == 0 ? N1 : N1 + N2 + 15) / 16, (axis == 0 ? M1 + M2 : M1 + 15) / 16); float *d_X1, *d_X2, *d_Z; hipError_t err; size_t size1 = M1 * N1 * sizeof(float); size_t size2 = M2 * N2 * sizeof(float); size_t sizeZ = (axis == 0 ? (M1 + M2) * N1 : M1 * (N1 + N2)) * sizeof(float); // 分配显存 err = hipMalloc(&d_X1, size1); if (err != hipSuccess) { /* 错误处理 */ } err = hipMalloc(&d_X2, size2); if (err != hipSuccess) { hipFree(d_X1); return; } err = hipMalloc(&d_Z, sizeZ); if (err != hipSuccess) { hipFree(d_X1); hipFree(d_X2); return; } // 拷贝数据到设备 hipMemcpyAsync(d_X1, X1, size1, hipMemcpyHostToDevice, compute_stream); hipMemcpyAsync(d_X2, X2, size2, hipMemcpyHostToDevice, compute_stream); // 启动核函数 // dim3 blockDim(16, 16); // dim3 gridDim((axis == 0 ? N1 : N1 + N2 + 15) / 16, (axis == 0 ? M1 + M2 : M1 + 15) / 16); _Concat2D<<>>(axis, M1, N1, d_X1, M2, N2, d_X2, d_Z); // 拷贝结果回主机 hipMemcpyAsync(Z, d_Z, sizeZ, hipMemcpyDeviceToHost, compute_stream); // 同步流 hipStreamSynchronize(compute_stream); // 释放资源 hipFree(d_X1); hipFree(d_X2); hipFree(d_Z); return; } //gemm #include __global__ void _Gemm(bool transA, bool transB, int M, int N, int K, float alpha, const float* A, const float* B, float beta, float* C) { int row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; if (row >= M || col >= N) return; float sum = 0.0f; for (int k = 0; k < K; ++k) { float a = transA ? A[k * M + row] : A[row * K + k]; float b = transB ? B[col * K + k] : B[k * N + col]; sum += a * b; } C[row * N + col] = alpha * sum + beta * C[row * N + col]; return; } void rocm_gemm(bool transA, bool transB, int M, int N, int K, float alpha, const float* A, const float* B, float beta, float* C, hipStream_t compute_stream) { dim3 blockDim(16, 16); dim3 gridDim((N + 15) / 16, (M + 15) / 16); float *d_A, *d_B, *d_C; hipError_t err; size_t sizeA = transA ? K * M * sizeof(float) : M * K * sizeof(float); size_t sizeB = transB ? N * K * sizeof(float) : K * N * sizeof(float); size_t sizeC = M * N * sizeof(float); // 分配显存 err = hipMalloc(&d_A, sizeA); if (err != hipSuccess) { goto error; } err = hipMalloc(&d_B, sizeB); if (err != hipSuccess) { hipFree(d_A); goto error; } err = hipMalloc(&d_C, sizeC); if (err != hipSuccess) { hipFree(d_A); hipFree(d_B); goto error; } // 主机 -> 设备拷贝 hipMemcpyAsync(d_A, A, sizeA, hipMemcpyHostToDevice, compute_stream); hipMemcpyAsync(d_B, B, sizeB, hipMemcpyHostToDevice, compute_stream); hipMemcpyAsync(d_C, C, sizeC, hipMemcpyHostToDevice, compute_stream); // 启动核函数 //dim3 blockDim(16, 16); //dim3 gridDim((N + 15) / 16, (M + 15) / 16); _Gemm<<>>(transA, transB, M, N, K, alpha, d_A, d_B, beta, d_C); // 设备 -> 主机拷贝 hipMemcpyAsync(C, d_C, sizeC, hipMemcpyDeviceToHost, compute_stream); // 同步流 hipStreamSynchronize(compute_stream); // 清理资源 hipFree(d_A); hipFree(d_B); hipFree(d_C); return; error: fprintf(stderr, "HIP memory allocation or memcpy failed in rocm_gemm\n"); if (d_A) hipFree(d_A); if (d_B) hipFree(d_B); if (d_C) hipFree(d_C); } //GroupNormalization #include __global__ void _GroupNorm( int64_t N, int64_t C, int64_t H, int64_t W, int64_t G, float eps, const float* X, float* Y, const float* gamma, const float* beta ) { // 计算当前组和样本索引 int64_t group_idx = hipBlockIdx_x; int64_t n = hipBlockIdx_y; int64_t channels_per_group = C / G; int64_t c_start = group_idx * channels_per_group; int64_t c_end = c_start + channels_per_group; // 组内总元素数 int64_t group_size = channels_per_group * H * W; // 共享内存用于归约求和 __shared__ float shared_sum[256]; __shared__ float shared_sum_sq[256]; // 每个线程计算局部和与平方和 float sum = 0.0f, sum_sq = 0.0f; for (int64_t idx = hipThreadIdx_x; idx < group_size; idx += hipBlockDim_x) { int64_t c = c_start + idx / (H * W); int64_t hw = idx % (H * W); int64_t h = hw / W; int64_t w = hw % W; int64_t linear_idx = n * C * H * W + c * H * W + h * W + w; float val = X[linear_idx]; sum += val; sum_sq += val * val; } shared_sum[hipThreadIdx_x] = sum; shared_sum_sq[hipThreadIdx_x] = sum_sq; __syncthreads(); // 树状归约求全局和 for (int s = hipBlockDim_x / 2; s > 0; s >>= 1) { if (hipThreadIdx_x < s) { shared_sum[hipThreadIdx_x] += shared_sum[hipThreadIdx_x + s]; shared_sum_sq[hipThreadIdx_x] += shared_sum_sq[hipThreadIdx_x + s]; } __syncthreads(); } // 计算均值和方差 float mean = shared_sum[0] / group_size; float var = shared_sum_sq[0] / group_size - mean * mean; // 归一化并应用仿射变换 for (int64_t idx = hipThreadIdx_x; idx < group_size; idx += hipBlockDim_x) { int64_t c = c_start + idx / (H * W); int64_t hw = idx % (H * W); int64_t h = hw / W; int64_t w = hw % W; int64_t linear_idx = n * C * H * W + c * H * W + h * W + w; float val = (X[linear_idx] - mean) / sqrtf(var + eps); Y[linear_idx] = gamma[c] * val + beta[c]; } return; } void rocm_group_norm( int64_t N, int64_t C, int64_t H, int64_t W, int64_t G, float eps, const float* X, float* Y, const float* gamma, const float* beta, hipStream_t compute_stream ) { dim3 block_dim(256); // 每个块256线程 dim3 grid_dim(G, N); // 每个组和样本对应一个块 // 参数校验 if (C % G != 0) { fprintf(stderr, "Error: Channels must be divisible by groups.\n"); return; } // 分配设备内存 float *d_X, *d_Y, *d_gamma, *d_beta; hipError_t err; size_t input_size = N * C * H * W * sizeof(float); size_t param_size = C * sizeof(float); err = hipMalloc(&d_X, input_size); if (err != hipSuccess) { /* 处理错误 */ } err = hipMalloc(&d_Y, input_size); if (err != hipSuccess) { hipFree(d_X); return; } err = hipMalloc(&d_gamma, param_size); if (err != hipSuccess) { hipFree(d_X); hipFree(d_Y); return; } err = hipMalloc(&d_beta, param_size); if (err != hipSuccess) { hipFree(d_X); hipFree(d_Y); hipFree(d_gamma); return; } // 数据拷贝到设备 hipMemcpyAsync(d_X, X, input_size, hipMemcpyHostToDevice, compute_stream); hipMemcpyAsync(d_gamma, gamma, param_size, hipMemcpyHostToDevice, compute_stream); hipMemcpyAsync(d_beta, beta, param_size, hipMemcpyHostToDevice, compute_stream); // 配置核函数参数 // dim3 block_dim(256); // 每个块256线程 // dim3 grid_dim(G, N); // 每个组和样本对应一个块 // 启动核函数 _GroupNorm<<>>( N, C, H, W, G, eps, d_X, d_Y, d_gamma, d_beta ); // 拷贝结果回主机 hipMemcpyAsync(Y, d_Y, input_size, hipMemcpyDeviceToHost, compute_stream); // 同步流并释放资源 hipStreamSynchronize(compute_stream); hipFree(d_X); hipFree(d_Y); hipFree(d_gamma); hipFree(d_beta); } //LogSoftmax __global__ void _LogSoftmax(int64_t N, int64_t D, const float* X, float* Y) { int64_t n = hipBlockIdx_x; // 每个样本一个线程块 int tid = hipThreadIdx_x; // 共享内存存储最大值和指数和 __shared__ float shared_max[256]; __shared__ float shared_sum[256]; // 步骤1：计算样本内最大值 float max_val = -INFINITY; for (int64_t i = tid; i < D; i += hipBlockDim_x) { max_val = fmaxf(max_val, X[n * D + i]); } shared_max[tid] = max_val; __syncthreads(); // 归约求全局最大值 for (int s = hipBlockDim_x / 2; s > 0; s >>= 1) { if (tid < s && shared_max[tid + s] > shared_max[tid]) { shared_max[tid] = shared_max[tid + s]; } __syncthreads(); } float global_max = shared_max[0]; // 步骤2：计算指数和 float exp_sum = 0.0f; for (int64_t i = tid; i < D; i += hipBlockDim_x) { exp_sum += expf(X[n * D + i] - global_max); } shared_sum[tid] = exp_sum; __syncthreads(); // 归约求全局指数和 for (int s = hipBlockDim_x / 2; s > 0; s >>= 1) { if (tid < s) { shared_sum[tid] += shared_sum[tid + s]; } __syncthreads(); } float global_sum = shared_sum[0]; // 步骤3：计算LogSoftmax for (int64_t i = tid; i < D; i += hipBlockDim_x) { Y[n * D + i] = (X[n * D + i] - global_max) - logf(global_sum); } return; } void rocm_log_softmax( int64_t N, int64_t D, const float* X, float* Y, hipStream_t compute_stream ) { dim3 block_dim(256); // 每个块256线程 dim3 grid_dim(N); // 每个样本一个线程块 // 分配设备内存 float *d_X, *d_Y; hipError_t err; size_t input_size = N * D * sizeof(float); err = hipMalloc(&d_X, input_size); if (err != hipSuccess) { /* 处理错误 */ } err = hipMalloc(&d_Y, input_size); if (err != hipSuccess) { hipFree(d_X); return; } // 数据拷贝到设备 hipMemcpyAsync(d_X, X, input_size, hipMemcpyHostToDevice, compute_stream); // 配置核函数参数 //dim3 block_dim(256); // 每个块256线程 //dim3 grid_dim(N); // 每个样本一个线程块 // 启动核函数 _LogSoftmax<<>>(N, D, d_X, d_Y); // 拷贝结果回主机 hipMemcpyAsync(Y, d_Y, input_size, hipMemcpyDeviceToHost, compute_stream); // 同步流并释放资源 hipStreamSynchronize(compute_stream); hipFree(d_X); hipFree(d_Y); } //attention __global__ void _DotProductAttention(int B, int S, int H, const float* Q, const float* K, const float* V, float scaling, float* output) { int b = blockIdx.z; int i = blockIdx.y * blockDim.y + threadIdx.y; // query index int j = blockIdx.x * blockDim.x + threadIdx.x; // hidden dim if (b >= B || i >= S || j >= H) return; // 计算 Q·K^T[i, k] float scores[128]; // 假设 seq_len <= 128 for (int k = 0; k < S; ++k) { float dot = 0.f; for (int h = 0; h < H; ++h) { dot += Q[(b * S + i) * H + h] * K[(b * S + k) * H + h]; } scores[k] = dot / scaling; } // softmax over scores float max_val = scores[0]; for (int k = 1; k < S; ++k) max_val = fmaxf(max_val, scores[k]); float sum = 0.f; for (int k = 0; k < S; ++k) { scores[k] = expf(scores[k] - max_val); sum += scores[k]; } for (int k = 0; k < S; ++k) scores[k] /= sum; // output = softmax * V float result = 0.f; for (int k = 0; k < S; ++k) { result += scores[k] * V[(b * S + k) * H + j]; } output[(b * S + i) * H + j] = result; } extern "C" void rocm_attention(int B, int S, int H, const float* Q, const float* K, const float* V, float* Out, hipStream_t stream) { dim3 blockDim(16, 16); dim3 gridDim((H + 15) / 16, (S + 15) / 16, B); float *d_Q, *d_K, *d_V, *d_Out; size_t size = B * S * H * sizeof(float); hipMalloc(&d_Q, size); hipMalloc(&d_K, size); hipMalloc(&d_V, size); hipMalloc(&d_Out, size); hipMemcpyAsync(d_Q, Q, size, hipMemcpyHostToDevice, stream); hipMemcpyAsync(d_K, K, size, hipMemcpyHostToDevice, stream); hipMemcpyAsync(d_V, V, size, hipMemcpyHostToDevice, stream); float scale = sqrtf((float)H); // dim3 blockDim(16, 16); // dim3 gridDim((H + 15) / 16, (S + 15) / 16, B); _DotProductAttention<<>>(B, S, H, d_Q, d_K, d_V, scale, d_Out); hipMemcpyAsync(Out, d_Out, size, hipMemcpyDeviceToHost, stream); hipStreamSynchronize(stream); hipFree(d_Q); hipFree(d_K); hipFree(d_V); hipFree(d_Out); return; } // BatchNormalization __global__ void _BatchNormalization( int N, int C, int H, int W, const float* X, const float* gamma, const float* beta, const float* mean, const float* var, float epsilon, float* Y) { // global thread index int idx = blockIdx.x * blockDim.x + threadIdx.x; int total = N * C * H * W; if (idx >= total) return; // 计算坐标 int w = idx % W; int tmp = idx / W; int h = tmp % H; tmp = tmp / H; int c = tmp % C; int n = tmp / C; // 计算 Y = gamma[c] * (X - mean[c]) / sqrt(var[c] + eps) + beta[c] int offset = ((n * C + c) * H + h) * W + w; float x = X[offset]; float m = mean[c]; float v = var[c]; float inv_std = rsqrtf(v + epsilon); Y[offset] = gamma[c] * ((x - m) * inv_std) + beta[c]; } // host API：rocm_batch_norm extern "C" void rocm_batch_norm( int64_t N, int64_t C, int64_t H, int64_t W, const float* X, const float* gamma, const float* beta, const float* mean, const float* var, float epsilon, float* Y, hipStream_t stream) { size_t total = (size_t)N * C * H * W; // 分配并拷贝 X、gamma、beta、mean、var 到设备 float *d_X, *d_gamma, *d_beta, *d_mean, *d_var, *d_Y; hipMalloc(&d_X, total * sizeof(float)); hipMalloc(&d_Y, total * sizeof(float)); hipMalloc(&d_gamma, C * sizeof(float)); hipMalloc(&d_beta, C * sizeof(float)); hipMalloc(&d_mean, C * sizeof(float)); hipMalloc(&d_var, C * sizeof(float)); hipMemcpyAsync(d_X, X, total * sizeof(float), hipMemcpyHostToDevice, stream); hipMemcpyAsync(d_gamma, gamma, C * sizeof(float), hipMemcpyHostToDevice, stream); hipMemcpyAsync(d_beta, beta, C * sizeof(float), hipMemcpyHostToDevice, stream); hipMemcpyAsync(d_mean, mean, C * sizeof(float), hipMemcpyHostToDevice, stream); hipMemcpyAsync(d_var, var, C * sizeof(float), hipMemcpyHostToDevice, stream); // 启动核函数：一维线程组织 int threads = 256; int blocks = (total + threads - 1) / threads; _BatchNormalization<<>>( N, C, H, W, d_X, d_gamma, d_beta, d_mean, d_var, epsilon, d_Y); // 拷回结果 hipMemcpyAsync(Y, d_Y, total * sizeof(float), hipMemcpyDeviceToHost, stream); hipStreamSynchronize(stream); // 释放设备内存 hipFree(d_X); hipFree(d_Y); hipFree(d_gamma); hipFree(d_beta); hipFree(d_mean); hipFree(d_var); return; } // Cast Operator: float to int32 // Device kernel: cast each element __global__ void _Cast( int total, const float* X, int* Y) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= total) return; // cast float to int32 Y[idx] = static_cast(X[idx]); } // Host API: rocm_cast extern "C" void rocm_cast( int64_t N, int64_t C, int64_t H, int64_t W, const float* X, int* Y, hipStream_t stream) { // total elements size_t total = (size_t)N * C * H * W; // allocate device memory float* d_X; int* d_Y; hipMalloc(&d_X, total * sizeof(float)); hipMalloc(&d_Y, total * sizeof(int)); // copy input to device hipMemcpyAsync(d_X, X, total * sizeof(float), hipMemcpyHostToDevice, stream); // launch kernel int threads = 256; int blocks = (total + threads - 1) / threads; _Cast<<>>( total, d_X, d_Y); // copy result back hipMemcpyAsync(Y, d_Y, total * sizeof(int), hipMemcpyDeviceToHost, stream); hipStreamSynchronize(stream); // free device memory hipFree(d_X); hipFree(d_Y); return; } extern "C" __global__ void SoftmaxKernel(const float* X, float* Y, int M, int N) { // M = batch_size, N = feature_size int row = blockIdx.x * blockDim.x + threadIdx.x; if (row >= M) return; const float* x_row = X + row * N; float* y_row = Y + row * N; // 1) 找到这一行的最大值，用于数值稳定性 float m = x_row[0]; for (int j = 1; j < N; ++j) { m = fmaxf(m, x_row[j]); } // 2) 计算 exp(x - m) 并累加 float sum = 0.f; for (int j = 0; j < N; ++j) { float e = expf(x_row[j] - m); y_row[j] = e; sum += e; } // 3) 归一化 for (int j = 0; j < N; ++j) { y_row[j] /= sum; } } // 这个函数由 ONNX Runtime 调用，替代原来的 rocm_add extern "C" void rocm_softmax(int64_t M, int64_t N, const float* X, float* Y, hipStream_t stream) { // 每个线程处理一行，线程块大小 128 const int threads = 128; const int blocks = static_cast((M + threads - 1) / threads); hipLaunchKernelGGL( SoftmaxKernel, dim3(blocks), dim3(threads), 0, // shared mem stream, // hip stream X, Y, static_cast(M), static_cast(N) ); return; } template __global__ void _CeluKernel(const T* X, T* Y, int64_t size, T alpha) { int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { T v = X[idx]; T pos = v > T(0) ? v : T(0); T neg = v <= T(0) ? alpha * (exp(v / alpha) - T(1)) : T(0); Y[idx] = pos + neg; } return; } extern "C" void rocm_celu(int64_t size, const float* X, float* Y, float alpha, hipStream_t stream) { float *d_X, *d_Y; hipMalloc(&d_X, size * sizeof(float)); hipMalloc(&d_Y, size * sizeof(float)); hipMemcpyAsync(d_X, X, size * sizeof(float), hipMemcpyHostToDevice, stream); int threads = 256; int blocks = (size + threads - 1) / threads; _CeluKernel<<>>(d_X, d_Y, size, alpha); hipMemcpyAsync(Y, d_Y, size * sizeof(float), hipMemcpyDeviceToHost, stream); hipStreamSynchronize(stream); hipFree(d_X); hipFree(d_Y); return; } //relu template __global__ void _rocm_relu_kernel(float* input, float* output, int64_t size) { int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= size) return; output[idx] = fmaxf(0.0f, input[idx]); } extern "C" void rocm_relu( int64_t size, const float* X, float* Y, hipStream_t stream ) { size_t input_size = size * sizeof(float); float *d_X, *d_Y; hipMalloc(&d_X, input_size); hipMalloc(&d_Y, input_size); hipMemcpyAsync(d_X, X, input_size, hipMemcpyHostToDevice, stream); int threads = 256; int blocks = (size + threads - 1) / threads; _rocm_relu_kernel<<>>(d_X, d_Y, size); hipMemcpyAsync(Y, d_Y, input_size, hipMemcpyDeviceToHost, stream); hipStreamSynchronize(stream); hipFree(d_X); hipFree(d_Y); return; } // ------------------------------- // TopK // ------------------------------- extern "C" __global__ void TopKKernel( const float* __restrict__ X, // [M * N] float* __restrict__ values, // [M * K] int64_t* __restrict__ indices, // [M * K] int M, int N, int K ) { int row = blockIdx.x * blockDim.x + threadIdx.x; if (row >= M) return; const float* x_row = X + size_t(row) * N; float* v_row = values + size_t(row) * K; int64_t* i_row = indices + size_t(row) * K; // 动态共享内存布局：前 K 个 float 存放 topK 值，后 K 个 int 存放对应索引 extern __shared__ char smem[]; float* shared_vals = (float*)smem; int* shared_idx = (int*)(smem + K * sizeof(float)); // 初始化：shared_vals = -INF, shared_idx = -1 for (int t = threadIdx.x; t < K; t += blockDim.x) { shared_vals[t] = -INFINITY; shared_idx[t] = -1; } __syncthreads(); // 扫描整行，维护一个长度为 K 的最小堆逻辑（但这里用简化的线性扫描替代堆） for (int j = 0; j < N; ++j) { float v = x_row[j]; // 找当前最小值位置 float min_val = shared_vals[0]; int min_pos = 0; for (int t = 1; t < K; ++t) { if (shared_vals[t] < min_val) { min_val = shared_vals[t]; min_pos = t; } } // 替换 if (v > min_val) { shared_vals[min_pos] = v; shared_idx[min_pos] = j; } } __syncthreads(); // 对这 K 个元素做简单排序（降序），K 通常比较小 for (int i = 0; i < K; ++i) { for (int j = i + 1; j < K; ++j) { if (shared_vals[j] > shared_vals[i]) { // swap value float tv = shared_vals[i]; shared_vals[i] = shared_vals[j]; shared_vals[j] = tv; // swap idx int ti = shared_idx[i]; shared_idx[i] = shared_idx[j]; shared_idx[j] = ti; } } } __syncthreads(); // 写回全局内存 for (int t = threadIdx.x; t < K; t += blockDim.x) { v_row[t] = shared_vals[t]; i_row[t] = (int64_t)shared_idx[t]; } } extern "C" void rocm_topk( int64_t M, int64_t N, int64_t K, const float* X, float* values, int64_t* indices, hipStream_t stream ) { // 分配设备内存 size_t sizeX = size_t(M) * N * sizeof(float); size_t sizeOutVal = size_t(M) * K * sizeof(float); size_t sizeOutIdx = size_t(M) * K * sizeof(int64_t); float* d_X; float* d_vals; int64_t* d_idx; if (hipMalloc(&d_X, sizeX ) != hipSuccess || hipMalloc(&d_vals,sizeOutVal) != hipSuccess || hipMalloc(&d_idx, sizeOutIdx) != hipSuccess) { fprintf(stderr, "HIP malloc failed in rocm_topk\n"); if (d_X) hipFree(d_X); if (d_vals) hipFree(d_vals); if (d_idx) hipFree(d_idx); return; } // 拷贝输入到设备 hipMemcpyAsync(d_X, X, sizeX, hipMemcpyHostToDevice, stream); // 启动 Kernel：每个线程处理一行，动态共享内存大小 = K*(sizeof(float)+sizeof(int)) dim3 blockDim(128); dim3 gridDim((M + blockDim.x - 1) / blockDim.x); size_t shared_bytes = K * (sizeof(float) + sizeof(int)); hipLaunchKernelGGL( TopKKernel, gridDim, blockDim, shared_bytes, stream, d_X, d_vals, d_idx, int(M), int(N), int(K) ); // 拷贝结果回主机 hipMemcpyAsync(values, d_vals, sizeOutVal, hipMemcpyDeviceToHost, stream); hipMemcpyAsync(indices, d_idx, sizeOutIdx, hipMemcpyDeviceToHost, stream); hipStreamSynchronize(stream); // 释放 hipFree(d_X); hipFree(d_vals); hipFree(d_idx); } //ReduceLogSum template __global__ void _rocm_reduce_log_sum_kernel( const T* input, T* output, int64_t N, int64_t C, int64_t H, int64_t W, int64_t axis, bool keep_dims ) { int idx = blockIdx.x * blockDim.x + threadIdx.x; // 计算总线程数 int total; if (keep_dims) { total = N * C * H * W; } else { switch (axis) { case 0: total = C * H * W; break; case 1: total = N * H * W; break; case 2: total = N * C * W; break; case 3: total = N * C * H; break; default: total = N * C * H * W; break; } } if (idx >= total) return; // 解析输出索引 int out_n = 0, out_c = 0, out_h = 0, out_w = 0; if (keep_dims) { out_n = idx / (C * H * W); int rem = idx % (C * H * W); out_c = rem / (H * W); rem %= H * W; out_h = rem / W; out_w = rem % W; } else { switch (axis) { case 0: { out_n = 0; out_c = idx / (H * W); int rem = idx % (H * W); out_h = rem / W; out_w = rem % W; break; } case 1: { out_n = idx / (H * W); int rem = idx % (H * W); out_c = 0; out_h = rem / W; out_w = rem % W; break; } case 2: { out_n = idx / (C * W); int rem = idx % (C * W); out_c = rem / W; out_w = rem % W; out_h = 0; break; } case 3: { out_n = idx / (C * H); int rem = idx % (C * H); out_c = rem / H; out_h = rem % H; out_w = 0; break; } default: { out_n = idx / (C * H * W); int rem = idx % (C * H * W); out_c = rem / (H * W); rem %= H * W; out_h = rem / W; out_w = rem % W; break; } } } // 计算输入索引范围 int64_t start = 0, end = 0; if (axis == 0) { start = out_c * H * W + out_h * W + out_w; end = N * C * H * W; } else if (axis == 1) { start = out_n * C * H * W + out_h * W + out_w; end = start + C * H * W; } else if (axis == 2) { start = out_n * C * H * W + out_c * H * W + out_w; end = start + H * C * W; } else if (axis == 3) { start = out_n * C * H * W + out_c * H * W + out_h * W; end = start + W * C * H; } // 累加求和 T sum = T(0); int64_t step; switch (axis) { case 0: step = C * H * W; break; case 1: step = H * W; break; case 2: step = W; break; case 3: step = 1; break; default: step = 1; break; } for (int64_t i = start; i < end; i += step) { sum += input[i]; } // 取自然对数 output[idx] = log(sum); } extern "C" void rocm_reduce_log_sum( int64_t N, int64_t C, int64_t H, int64_t W, const float* X, float* Y, int64_t axis, bool keep_dims, hipStream_t stream ) { // 计算输出尺寸 int64_t out_N = keep_dims ? N : (axis == 0 ? 1 : N); int64_t out_C = keep_dims ? C : (axis == 1 ? 1 : C); int64_t out_H = keep_dims ? H : (axis == 2 ? 1 : H); int64_t out_W = keep_dims ? W : (axis == 3 ? 1 : W); size_t input_size = N * C * H * W * sizeof(float); size_t output_size = out_N * out_C * out_H * out_W * sizeof(float); // 设备内存分配 float *d_X, *d_Y; hipMalloc(&d_X, input_size); hipMalloc(&d_Y, output_size); // 异步拷贝数据到设备 hipMemcpyAsync(d_X, X, input_size, hipMemcpyHostToDevice, stream); // 核函数配置 int total_threads = out_N * out_C * out_H * out_W; int block_size = 256; int grid_size = (total_threads + block_size - 1) / block_size; // 启动核函数 _rocm_reduce_log_sum_kernel<<>>( d_X, d_Y, N, C, H, W, axis, keep_dims ); // 异步拷贝结果回主机 hipMemcpyAsync(Y, d_Y, output_size, hipMemcpyDeviceToHost, stream); hipStreamSynchronize(stream); // 释放设备内存 hipFree(d_X); hipFree(d_Y); return; } __global__ void _RoiAlignKernel( const float* X, int N, int C, int H, int W, const float* rois, const int64_t* batch_inds, int num_rois, int out_h, int out_w, int sampling_ratio, float spatial_scale, float* Y) { int rid = blockIdx.x; // ROI index int c = blockIdx.y; // channel int ph = threadIdx.y; // pooled y int pw = threadIdx.x; // pooled x if (rid >= num_rois || c >= C || ph >= out_h || pw >= out_w) return; // 读取 ROI const float* roi_ptr = rois + rid * 4; float x1 = roi_ptr[0] * spatial_scale; float y1 = roi_ptr[1] * spatial_scale; float x2 = roi_ptr[2] * spatial_scale; float y2 = roi_ptr[3] * spatial_scale; int batch_id = static_cast(batch_inds[rid]); float roi_w = max(x2 - x1, 1.0f); float roi_h = max(y2 - y1, 1.0f); float bin_w = roi_w / static_cast(out_w); float bin_h = roi_h / static_cast(out_h); int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_h / out_h); int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_w / out_w); float count = static_cast(roi_bin_grid_h * roi_bin_grid_w); float sum = 0.0f; // 在每个 bin 内做平均 for (int iy = 0; iy < roi_bin_grid_h; ++iy) { float y = y1 + ph * bin_h + (iy + 0.5f) * bin_h / roi_bin_grid_h; for (int ix = 0; ix < roi_bin_grid_w; ++ix) { float x = x1 + pw * bin_w + (ix + 0.5f) * bin_w / roi_bin_grid_w; // 双线性插值 int x0 = floorf(x), x1i = min(x0 + 1, W - 1); int y0 = floorf(y), y1i = min(y0 + 1, H - 1); float lx = x - x0, ly = y - y0; const float* fmap = X + (batch_id * C + c) * H * W; float v00 = fmap[y0 * W + x0]; float v01 = fmap[y0 * W + x1i]; float v10 = fmap[y1i * W + x0]; float v11 = fmap[y1i * W + x1i]; float w00 = (1 - lx) * (1 - ly); float w01 = lx * (1 - ly); float w10 = (1 - lx) * ly; float w11 = lx * ly; sum += v00 * w00 + v01 * w01 + v10 * w10 + v11 * w11; } } float* out_ptr = Y + ((rid * C + c) * out_h + ph) * out_w + pw; *out_ptr = sum / count; } extern "C" void rocm_roi_align( const float* X, int64_t N, int64_t C, int64_t H, int64_t W, const float* rois, const int64_t* batch_inds, int64_t num_rois, int64_t out_h, int64_t out_w, int64_t sampling_ratio, float spatial_scale, float* Y, hipStream_t stream) { dim3 grid(num_rois, C); dim3 block(out_w, out_h); _RoiAlignKernel<<>>( X, N, C, H, W, rois, batch_inds, num_rois, out_h, out_w, sampling_ratio, spatial_scale, Y); hipStreamSynchronize(stream); } // LeakyReLU kernel __global__ void _LeakyReLUKernel(const float* X, float* Y, int64_t size, float alpha) { int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= size) return; float v = X[idx]; Y[idx] = (v >= 0.0f) ? v : alpha * v; } extern "C" void rocm_leaky_relu( int64_t size, const float* d_X, float* d_Y, float alpha, hipStream_t stream) { const int threads = 256; int blocks = static_cast((size + threads - 1) / threads); _LeakyReLUKernel<<>>(d_X, d_Y, size, alpha); } //Conv __global__ void _Conv2dKernel(const float* input, const float* weight, const float* bias, float* output, int N, int C_in, int H, int W, int C_out, int K_h, int K_w, int out_H, int out_W) { int n = blockIdx.x; int oc = blockIdx.y; int oh = threadIdx.y; int ow = threadIdx.x; if (oh >= out_H || ow >= out_W) return; float sum = bias[oc]; for (int ic = 0; ic < C_in; ++ic) { for (int kh = 0; kh < K_h; ++kh) { for (int kw = 0; kw < K_w; ++kw) { int ih = oh + kh; int iw = ow + kw; float val = input[n * (C_in * H * W) + ic * (H * W) + ih * W + iw]; float w = weight[oc * (C_in * K_h * K_w) + ic * (K_h * K_w) + kh * K_w + kw]; sum += val * w; } } } output[n * (C_out * out_H * out_W) + oc * (out_H * out_W) + oh * out_W + ow] = sum; } extern "C" void rocm_conv2d(const float* input, const float* weight, const float* bias, float* output, int N, int C_in, int H, int W, int C_out, int K_h, int K_w, int out_H, int out_W, hipStream_t stream) { dim3 blocks(N, C_out); dim3 threads(out_W, out_H); _Conv2dKernel<<>>( input, weight, bias, output, N, C_in, H, W, C_out, K_h, K_w, out_H, out_W); hipError_t err = hipGetLastError(); if (err != hipSuccess) { fprintf(stderr, "Conv2D kernel launch failed: %s\n", hipGetErrorString(err)); } }