Commit 8ac49790 authored by one's avatar one
Browse files

Update GEMV benchmarks, move to a separate dir

parent b3a56179
# Standalone Makefile for the BF16 GEMV benchmark (HIP/hipcc build only).
# NOTE(review): recipe lines below appear without leading tabs in this
# capture — in the real Makefile they must be tab-indented; verify on disk.
HIPCC ?= hipcc
CXXFLAGS ?= -std=c++17 -O3
# Target GPU architecture (override: make OFFLOAD_ARCH=<arch>)
OFFLOAD_ARCH ?= gfx936
TARGET := gemv_bench
SRC := gemv_bf16.cpp
# Header dependency: relink/rebuild when it changes
DEP := gemv_utils.h
.PHONY: all clean
all: $(TARGET)
$(TARGET): $(SRC) $(DEP)
$(HIPCC) $(CXXFLAGS) --offload-arch=$(OFFLOAD_ARCH) $< -o $@
clean:
rm -f $(TARGET)
This diff is collapsed.
# Makefile for the GEMV benchmark; supports both hipcc and nvcc toolchains.
# NOTE(review): recipe lines below appear without leading tabs in this
# capture — in the real Makefile they must be tab-indented; verify on disk.
CXX ?= hipcc
CXX_FLAGS ?= -std=c++17 -O3
# GPU arch: gfx* for hipcc (e.g. gfx936), sm_* for nvcc (e.g. sm_80)
GPU_ARCH ?= gfx936
TARGET := gemv_bench
SRC := main.cpp
DEP := gemv_bf16.h gemv_utils.h hip_compat.h
.PHONY: all clean
all: $(TARGET)
# Select the compiler flavor from the CXX variable
ifneq (,$(findstring nvcc,$(CXX)))
# NVCC build: force CUDA compilation of the .cpp source (-x cu)
$(TARGET): $(SRC) $(DEP)
$(CXX) $(CXX_FLAGS) -arch=$(GPU_ARCH) -x cu $< -o $@
else
# HIPCC build
$(TARGET): $(SRC) $(DEP)
$(CXX) $(CXX_FLAGS) --offload-arch=$(GPU_ARCH) $< -o $@
endif
clean:
rm -f $(TARGET)
GEMV Benchmarks
---------------
模仿 GEMM 接口的 GEMV,即 N=1,实现 BF16 版本。这些矩阵形状来自于 Evo2 推理过程。
计算公式:y = alpha * A^T * x + beta * y
M: 输出维度,例如 11264
K: 归约维度,例如 4096
N: 始终为 1
beta: 始终为 0
## Build
```bash
# 使用 HIPCC:
CXX=hipcc make GPU_ARCH=gfx936
# 使用 NVCC:
CXX=nvcc make GPU_ARCH=sm_80
```
## Run
```bash
# BW系列:
HIP_VISIBLE_DEVICES=1 numactl -N 0 -m 0 ./gemv_bench -M 11264 -K 4096
# A800:
./gemv_bench -M 11264 -K 4096
```
\ No newline at end of file
This diff is collapsed.
#pragma once #pragma once
#include "hip_compat.h"
#include <algorithm> #include <algorithm>
#include <functional> #include <functional>
#include <hip/hip_bfloat16.h>
#include <hip/hip_runtime.h>
#include <iomanip> #include <iomanip>
#include <iostream> #include <iostream>
#include <string> #include <string>
#include <vector> #include <vector>
// -------------------------------------------------------------------------------- // ============================================================================
// Error Handling // Error Handling
// -------------------------------------------------------------------------------- // ============================================================================
inline void checkHipErrors(hipError_t result) { inline void checkHipErrors(hipError_t result) {
if (result != hipSuccess) { if (result != hipSuccess) {
...@@ -20,9 +19,21 @@ inline void checkHipErrors(hipError_t result) { ...@@ -20,9 +19,21 @@ inline void checkHipErrors(hipError_t result) {
} }
} }
// -------------------------------------------------------------------------------- // ============================================================================
// Device Info
// ============================================================================
/// L2 cache size in MB
inline int get_l2_cache_size(int device = 0) {
hipDeviceProp_t prop;
checkHipErrors(hipGetDeviceProperties(&prop, device));
return prop.l2CacheSize / 1024 / 1024;
}
// ============================================================================
// Command Line Parsing // Command Line Parsing
// -------------------------------------------------------------------------------- // ============================================================================
inline char *getCmdOption(char **begin, char **end, const std::string &option) { inline char *getCmdOption(char **begin, char **end, const std::string &option) {
char **itr = std::find(begin, end, option); char **itr = std::find(begin, end, option);
...@@ -32,9 +43,9 @@ inline char *getCmdOption(char **begin, char **end, const std::string &option) { ...@@ -32,9 +43,9 @@ inline char *getCmdOption(char **begin, char **end, const std::string &option) {
return 0; return 0;
} }
// -------------------------------------------------------------------------------- // ============================================================================
// CPU Reference & Verification // CPU Reference & Verification
// -------------------------------------------------------------------------------- // ============================================================================
inline void gemv_cpu(int M, int K, float alpha, const hip_bfloat16 *h_A, inline void gemv_cpu(int M, int K, float alpha, const hip_bfloat16 *h_A,
int lda, const hip_bfloat16 *h_x, float beta, int lda, const hip_bfloat16 *h_x, float beta,
...@@ -46,7 +57,7 @@ inline void gemv_cpu(int M, int K, float alpha, const hip_bfloat16 *h_A, ...@@ -46,7 +57,7 @@ inline void gemv_cpu(int M, int K, float alpha, const hip_bfloat16 *h_A,
float val_x = static_cast<float>(h_x[k]); float val_x = static_cast<float>(h_x[k]);
sum += val_a * val_x; sum += val_a * val_x;
} }
h_y[m] = hip_bfloat16(alpha * sum + beta * h_y[m]); h_y[m] = hip_bfloat16(alpha * sum + beta * static_cast<float>(h_y[m]));
} }
return; return;
...@@ -81,9 +92,9 @@ inline bool verify_result(int M, const hip_bfloat16 *h_y_gpu, ...@@ -81,9 +92,9 @@ inline bool verify_result(int M, const hip_bfloat16 *h_y_gpu,
return true; return true;
} }
// -------------------------------------------------------------------------------- // ============================================================================
// Benchmark Framework // Benchmark Framework
// -------------------------------------------------------------------------------- // ============================================================================
// 定义统一的 Kernel Launcher 签名 // 定义统一的 Kernel Launcher 签名
using KernelLauncher = std::function<void( using KernelLauncher = std::function<void(
...@@ -131,6 +142,8 @@ inline void run_benchmark(const std::vector<KernelCase> &cases, int M, int K, ...@@ -131,6 +142,8 @@ inline void run_benchmark(const std::vector<KernelCase> &cases, int M, int K,
printf("%s\n", std::string(w_table, '-').c_str()); printf("%s\n", std::string(w_table, '-').c_str());
printf("M=%d, K=%d, N=1\n", M, K); printf("M=%d, K=%d, N=1\n", M, K);
printf("lda=%d\n", lda); printf("lda=%d\n", lda);
printf("sizeof(A)=%lu MB\n", M * lda * sizeof(hip_bfloat16) / 1024 / 1024);
printf("L2 cache=%d MB\n", get_l2_cache_size());
printf("%s\n", std::string(w_table, '-').c_str()); printf("%s\n", std::string(w_table, '-').c_str());
printf("%-38s %10s %10s %10s %8s\n", "Kernel Name", "Time (us)", "GFLOPS", printf("%-38s %10s %10s %10s %8s\n", "Kernel Name", "Time (us)", "GFLOPS",
"BW (GB/s)", "Result"); "BW (GB/s)", "Result");
...@@ -165,7 +178,7 @@ inline void run_benchmark(const std::vector<KernelCase> &cases, int M, int K, ...@@ -165,7 +178,7 @@ inline void run_benchmark(const std::vector<KernelCase> &cases, int M, int K,
checkHipErrors(hipDeviceSynchronize()); checkHipErrors(hipDeviceSynchronize());
// 3. Timing // 3. Timing
int num_runs = 1000; int num_runs = 100;
checkHipErrors(hipEventRecord(start)); checkHipErrors(hipEventRecord(start));
for (int i = 0; i < num_runs; ++i) { for (int i = 0; i < num_runs; ++i) {
k.func(M, K, alpha, A, lda, x, beta, y); k.func(M, K, alpha, A, lda, x, beta, y);
......
#pragma once
/**
 * HIP-to-CUDA compatibility layer.
 *
 * When compiling with nvcc, the HIP API names used by this benchmark are
 * mapped to their CUDA equivalents via the macros below.
 * When compiling with hipcc, the native HIP headers are included instead.
 */
#if defined(__NVCC__) || defined(__CUDACC__)
#include <cuda_runtime.h>
#include <cuda_bf16.h>
#include <stdio.h>
// Runtime API mappings
#define hipMalloc cudaMalloc
#define hipFree cudaFree
#define hipMemcpy cudaMemcpy
#define hipMemcpyHostToDevice cudaMemcpyHostToDevice
#define hipMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define hipMemset cudaMemset
#define hipDeviceSynchronize cudaDeviceSynchronize
#define hipGetDeviceProperties cudaGetDeviceProperties
#define hipGetErrorString cudaGetErrorString
// Event API mappings
#define hipEvent_t cudaEvent_t
#define hipEventCreate cudaEventCreate
#define hipEventDestroy cudaEventDestroy
#define hipEventRecord cudaEventRecord
#define hipEventSynchronize cudaEventSynchronize
#define hipEventElapsedTime cudaEventElapsedTime
// Type mappings
#define hipDeviceProp_t cudaDeviceProp
#define hipError_t cudaError_t
#define hipSuccess cudaSuccess
// CUDA uses __nv_bfloat16 where HIP uses hip_bfloat16
typedef __nv_bfloat16 hip_bfloat16;
// Shuffle intrinsic mapping.
// CUDA 9.0+ requires the *_sync variants, which take a warp mask;
// 0xffffffff means every thread in the warp participates.
#ifndef __shfl_down
#define __shfl_down(val, offset) __shfl_down_sync(0xffffffff, val, offset)
#endif
#else
#include <hip/hip_runtime.h>
#include <hip/hip_bfloat16.h>
#endif
#include "gemv_bf16.h"
int main(int argc, char **argv) {
bool do_verify = false;
float alpha = 1.0f;
float beta = 0.0f;
int M = 11264;
int K = 4096;
// int N = 1; // Unused
int lda = K;
int block_size = 256;
if (char *value = getCmdOption(argv, argv + argc, "--verify")) {
do_verify = std::stoi(value) == 1;
}
if (char *value = getCmdOption(argv, argv + argc, "--alpha")) {
alpha = std::stof(value);
}
if (char *value = getCmdOption(argv, argv + argc, "-M")) {
M = std::stoi(value);
}
if (char *value = getCmdOption(argv, argv + argc, "-K")) {
K = std::stoi(value);
lda = K;
}
if (char *value = getCmdOption(argv, argv + argc, "--lda")) {
lda = std::stoi(value);
}
if (char *value = getCmdOption(argv, argv + argc, "-B")) {
block_size = std::stoi(value);
}
// transA=T,因此是行优先
size_t count_A = (size_t)M * lda;
size_t size_A = count_A * sizeof(hip_bfloat16);
size_t size_x = (size_t)K * sizeof(hip_bfloat16);
size_t size_y = (size_t)M * sizeof(hip_bfloat16);
// Host 内存分配
std::vector<hip_bfloat16> h_A(count_A);
std::vector<hip_bfloat16> h_x(K);
std::vector<hip_bfloat16> h_y(M);
// 随机初始数据
const float rand_max = static_cast<float>(RAND_MAX);
for (int i = 0; i < count_A; i++)
h_A[i] = hip_bfloat16(static_cast<float>(rand()) / rand_max);
for (int i = 0; i < K; i++)
h_x[i] = hip_bfloat16(static_cast<float>(rand()) / rand_max);
for (int i = 0; i < M; i++)
h_y[i] = hip_bfloat16(0.0f);
// Device 内存分配
hip_bfloat16 *d_A, *d_x, *d_y;
checkHipErrors(hipMalloc(&d_A, size_A));
checkHipErrors(hipMalloc(&d_x, size_x));
checkHipErrors(hipMalloc(&d_y, size_y));
checkHipErrors(hipMemcpy(d_A, h_A.data(), size_A, hipMemcpyHostToDevice));
checkHipErrors(hipMemcpy(d_x, h_x.data(), size_x, hipMemcpyHostToDevice));
checkHipErrors(hipMemcpy(d_y, h_y.data(), size_y, hipMemcpyHostToDevice));
// Kernel 注册表
std::vector<KernelCase> kernels;
constexpr bool NTL = true;
constexpr int UNROLL = 4;
constexpr int TILE_K = calculate_tile_k<8>(4);
constexpr int ROWS_PER_WARP = 2;
kernels.push_back(
{"naive", [&](int M, int K, float alpha, const hip_bfloat16 *A, int lda,
const hip_bfloat16 *x, float beta, hip_bfloat16 *y) {
int grid = (M + block_size - 1) / block_size;
gemv_bf16_TN_naive<<<grid, block_size>>>(M, K, alpha, A, lda, x, beta,
y);
}});
kernels.push_back(
{"vec8", [&](int M, int K, float alpha, const hip_bfloat16 *A, int lda,
const hip_bfloat16 *x, float beta, hip_bfloat16 *y) {
int grid = (M + block_size - 1) / block_size;
gemv_bf16_TN_vec<<<grid, block_size>>>(M, K, alpha, A, lda, x, beta,
y);
}});
kernels.push_back(
{"vec8_ntl",
[&](int M, int K, float alpha, const hip_bfloat16 *A, int lda,
const hip_bfloat16 *x, float beta, hip_bfloat16 *y) {
int grid = (M + block_size - 1) / block_size;
gemv_bf16_TN_vec<NTL>
<<<grid, block_size>>>(M, K, alpha, A, lda, x, beta, y);
}});
kernels.push_back(
{"warp", [&](int M, int K, float alpha, const hip_bfloat16 *A, int lda,
const hip_bfloat16 *x, float beta, hip_bfloat16 *y) {
int warps_per_block = block_size / WARP_SIZE;
int grid = (M + warps_per_block - 1) / warps_per_block;
gemv_bf16_TN_warp<<<grid, block_size>>>(M, K, alpha, A, lda, x, beta,
y);
}});
kernels.push_back(
{"vec8+warp",
[&](int M, int K, float alpha, const hip_bfloat16 *A, int lda,
const hip_bfloat16 *x, float beta, hip_bfloat16 *y) {
int warps_per_block = block_size / WARP_SIZE;
int grid = (M + warps_per_block - 1) / warps_per_block;
gemv_bf16_TN_vec_warp<<<grid, block_size>>>(M, K, alpha, A, lda, x,
beta, y);
}});
kernels.push_back(
{"vec8_ntl+warp",
[&](int M, int K, float alpha, const hip_bfloat16 *A, int lda,
const hip_bfloat16 *x, float beta, hip_bfloat16 *y) {
int warps_per_block = block_size / WARP_SIZE;
int grid = (M + warps_per_block - 1) / warps_per_block;
gemv_bf16_TN_vec_warp<NTL>
<<<grid, block_size>>>(M, K, alpha, A, lda, x, beta, y);
}});
kernels.push_back(
{"vec8+warp_mr" + std::to_string(ROWS_PER_WARP),
[&](int M, int K, float alpha, const hip_bfloat16 *A, int lda,
const hip_bfloat16 *x, float beta, hip_bfloat16 *y) {
int warps_per_block = block_size / WARP_SIZE;
int grid =
((M + ROWS_PER_WARP - 1) / ROWS_PER_WARP + warps_per_block - 1) /
warps_per_block;
gemv_bf16_TN_vec_warp_mr<!NTL, ROWS_PER_WARP>
<<<grid, block_size>>>(M, K, alpha, A, lda, x, beta, y);
}});
kernels.push_back(
{"vec8_ntl+warp_mr" + std::to_string(ROWS_PER_WARP),
[&](int M, int K, float alpha, const hip_bfloat16 *A, int lda,
const hip_bfloat16 *x, float beta, hip_bfloat16 *y) {
int warps_per_block = block_size / WARP_SIZE;
int grid =
((M + ROWS_PER_WARP - 1) / ROWS_PER_WARP + warps_per_block - 1) /
warps_per_block;
gemv_bf16_TN_vec_warp_mr<NTL, ROWS_PER_WARP>
<<<grid, block_size>>>(M, K, alpha, A, lda, x, beta, y);
}});
kernels.push_back(
{"vec8+warp+unroll" + std::to_string(UNROLL),
[&](int M, int K, float alpha, const hip_bfloat16 *A, int lda,
const hip_bfloat16 *x, float beta, hip_bfloat16 *y) {
int warps_per_block = block_size / WARP_SIZE;
int grid = (M + warps_per_block - 1) / warps_per_block;
gemv_bf16_TN_vec_warp_unroll<!NTL, UNROLL>
<<<grid, block_size>>>(M, K, alpha, A, lda, x, beta, y);
}});
kernels.push_back(
{"vec8_ntl+warp+unroll" + std::to_string(UNROLL),
[&](int M, int K, float alpha, const hip_bfloat16 *A, int lda,
const hip_bfloat16 *x, float beta, hip_bfloat16 *y) {
int warps_per_block = block_size / WARP_SIZE;
int grid = (M + warps_per_block - 1) / warps_per_block;
gemv_bf16_TN_vec_warp_unroll<NTL, UNROLL>
<<<grid, block_size>>>(M, K, alpha, A, lda, x, beta, y);
}});
kernels.push_back(
{"vec8+warp+shm" + std::to_string(TILE_K),
[&](int M, int K, float alpha, const hip_bfloat16 *A, int lda,
const hip_bfloat16 *x, float beta, hip_bfloat16 *y) {
int warps_per_block = block_size / WARP_SIZE;
int grid = (M + warps_per_block - 1) / warps_per_block;
gemv_bf16_TN_vec_warp_shm<!NTL, TILE_K>
<<<grid, block_size>>>(M, K, alpha, A, lda, x, beta, y);
}});
kernels.push_back(
{"vec8_ntl+warp+shm" + std::to_string(TILE_K),
[&](int M, int K, float alpha, const hip_bfloat16 *A, int lda,
const hip_bfloat16 *x, float beta, hip_bfloat16 *y) {
int warps_per_block = block_size / WARP_SIZE;
int grid = (M + warps_per_block - 1) / warps_per_block;
gemv_bf16_TN_vec_warp_shm<NTL, TILE_K>
<<<grid, block_size>>>(M, K, alpha, A, lda, x, beta, y);
}});
kernels.push_back(
{"vec8+warp+unroll" + std::to_string(UNROLL) + "+shm" +
std::to_string(TILE_K),
[&](int M, int K, float alpha, const hip_bfloat16 *A, int lda,
const hip_bfloat16 *x, float beta, hip_bfloat16 *y) {
int warps_per_block = block_size / WARP_SIZE;
int grid = (M + warps_per_block - 1) / warps_per_block;
gemv_bf16_TN_vec_warp_unroll_shm<!NTL, UNROLL, TILE_K>
<<<grid, block_size>>>(M, K, alpha, A, lda, x, beta, y);
}});
kernels.push_back(
{"vec8_ntl+warp+unroll" + std::to_string(UNROLL) + "+shm" +
std::to_string(TILE_K),
[&](int M, int K, float alpha, const hip_bfloat16 *A, int lda,
const hip_bfloat16 *x, float beta, hip_bfloat16 *y) {
int warps_per_block = block_size / WARP_SIZE;
int grid = (M + warps_per_block - 1) / warps_per_block;
gemv_bf16_TN_vec_warp_unroll_shm<NTL, UNROLL, TILE_K>
<<<grid, block_size>>>(M, K, alpha, A, lda, x, beta, y);
}});
kernels.push_back(
{"vec8+warp_mr" + std::to_string(ROWS_PER_WARP) + "+shm" +
std::to_string(TILE_K),
[&](int M, int K, float alpha, const hip_bfloat16 *A, int lda,
const hip_bfloat16 *x, float beta, hip_bfloat16 *y) {
int warps_per_block = block_size / WARP_SIZE;
int grid =
((M + ROWS_PER_WARP - 1) / ROWS_PER_WARP + warps_per_block - 1) /
warps_per_block;
gemv_bf16_TN_vec_warp_mr_shm<!NTL, TILE_K, ROWS_PER_WARP>
<<<grid, block_size>>>(M, K, alpha, A, lda, x, beta, y);
}});
kernels.push_back(
{"vec8_ntl+warp_mr" + std::to_string(ROWS_PER_WARP) + "+shm" +
std::to_string(TILE_K),
[&](int M, int K, float alpha, const hip_bfloat16 *A, int lda,
const hip_bfloat16 *x, float beta, hip_bfloat16 *y) {
int warps_per_block = block_size / WARP_SIZE;
int grid =
((M + ROWS_PER_WARP - 1) / ROWS_PER_WARP + warps_per_block - 1) /
warps_per_block;
gemv_bf16_TN_vec_warp_mr_shm<NTL, TILE_K, ROWS_PER_WARP>
<<<grid, block_size>>>(M, K, alpha, A, lda, x, beta, y);
}});
// 运行所有测试
run_benchmark(kernels, M, K, alpha, d_A, lda, d_x, beta, d_y, do_verify);
// 清理
checkHipErrors(hipFree(d_A));
checkHipErrors(hipFree(d_x));
checkHipErrors(hipFree(d_y));
return 0;
}
\ No newline at end of file
#!/bin/bash #!/bin/bash
set -e
# BW150 # BW150
export HIP_VISIBLE_DEVICES=1 export HIP_VISIBLE_DEVICES=1
BIND_CMD="numactl -N 0 -m 0" BIND_CMD="numactl -N 0 -m 0"
make CXX=hipcc make
if [[ "$*" == *"--pmc"* ]]; then
if [[ "$*" == *"--trace"* ]]; then PROF_CMD="hipprof --trace-off --pmc --pmc-type 3"
PROF_CMD="hipprof --trace-off --pmc"
${PROF_CMD} -o log/pmc-k1 ${BIND_CMD} ./gemv_bench --verify 1 -M 11264 -K 4096 ${PROF_CMD} -o log/pmc-k1 ${BIND_CMD} ./gemv_bench --verify 1 -M 11264 -K 4096
${PROF_CMD} -o log/pmc-k2 ${BIND_CMD} ./gemv_bench --verify 1 -M 4096 -K 11264 ${PROF_CMD} -o log/pmc-k2 ${BIND_CMD} ./gemv_bench --verify 1 -M 4096 -K 11264
${PROF_CMD} -o log/pmc-k3 ${BIND_CMD} ./gemv_bench --verify 1 -M 12288 -K 4096 ${PROF_CMD} -o log/pmc-k3 ${BIND_CMD} ./gemv_bench --verify 1 -M 12288 -K 4096
${PROF_CMD} -o log/pmc-k4 ${BIND_CMD} ./gemv_bench --verify 1 -M 4096 -K 4096 ${PROF_CMD} -o log/pmc-k4 ${BIND_CMD} ./gemv_bench --verify 1 -M 4096 -K 4096
elif [[ "$*" == *"--trace"* ]]; then
PROF_CMD="hipprof --hip-trace"
${PROF_CMD} -o log/trace-k1 ${BIND_CMD} ./gemv_bench --verify 1 -M 11264 -K 4096
${PROF_CMD} -o log/trace-k2 ${BIND_CMD} ./gemv_bench --verify 1 -M 4096 -K 11264
${PROF_CMD} -o log/trace-k3 ${BIND_CMD} ./gemv_bench --verify 1 -M 12288 -K 4096
${PROF_CMD} -o log/trace-k4 ${BIND_CMD} ./gemv_bench --verify 1 -M 4096 -K 4096
else else
${BIND_CMD} ./gemv_bench --verify 1 -M 11264 -K 4096 ${BIND_CMD} ./gemv_bench --verify 1 -M 11264 -K 4096
${BIND_CMD} ./gemv_bench --verify 1 -M 4096 -K 11264 ${BIND_CMD} ./gemv_bench --verify 1 -M 4096 -K 11264
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment