Commit a781cad3 authored by one
Browse files

Add warmups and loops params to GEMV benchmarks

parent 845b2d24
GPU_ARCH ?= gfx936
CXX ?= hipcc
CXX_FLAGS ?= -std=c++17 -O3
GPU_ARCH ?= gfx936
HIPCC_FLAGS = --offload-arch=$(GPU_ARCH)
NVCC_FLAGS = -arch=$(GPU_ARCH) -x cu
TARGET := gemv_bench
SRC := main.cpp
......@@ -11,15 +13,14 @@ DEP := gemv_bf16.h gemv_utils.h hip_compat.h
all: $(TARGET)
# 根据 CXX 变量判断编译器类型
ifneq (,$(findstring nvcc,$(CXX)))
# NVCC 编译
ifneq (,$(findstring hipcc,$(CXX)))
# HIPCC 编译
$(TARGET): $(SRC) $(DEP)
$(CXX) $(CXX_FLAGS) -arch=$(GPU_ARCH) -x cu $< -o $@
$(CXX) $(CXX_FLAGS) $(HIPCC_FLAGS) $< -o $@
else
# HIPCC 编译
# NVCC 编译
$(TARGET): $(SRC) $(DEP)
$(CXX) $(CXX_FLAGS) --offload-arch=$(GPU_ARCH) $< -o $@
$(CXX) $(CXX_FLAGS) $(NVCC_FLAGS) $< -o $@
endif
clean:
......
......@@ -106,7 +106,8 @@ struct KernelCase {
KernelLauncher func;
};
inline void run_benchmark(const std::vector<KernelCase> &cases, int M, int K,
inline void run_benchmark(int warmups, int loops,
const std::vector<KernelCase> &cases, int M, int K,
float alpha, const hip_bfloat16 *A, int lda,
const hip_bfloat16 *x, float beta, hip_bfloat16 *y,
bool do_verify) {
......@@ -144,6 +145,7 @@ inline void run_benchmark(const std::vector<KernelCase> &cases, int M, int K,
printf("lda=%d\n", lda);
printf("sizeof(A)=%lu MB\n", M * lda * sizeof(hip_bfloat16) / 1024 / 1024);
printf("L2 cache=%d MB\n", get_l2_cache_size());
printf("Warmups=%d, Loops=%d\n", warmups, loops);
printf("%s\n", std::string(w_table, '-').c_str());
printf("%-38s %10s %10s %10s %8s\n", "Kernel Name", "Time (us)", "GFLOPS",
"BW (GB/s)", "Result");
......@@ -172,13 +174,13 @@ inline void run_benchmark(const std::vector<KernelCase> &cases, int M, int K,
}
// 2. Warmup
for (int i = 0; i < 100; ++i) {
for (int i = 0; i < warmups; ++i) {
k.func(M, K, alpha, A, lda, x, beta, y);
}
checkHipErrors(hipDeviceSynchronize());
// 3. Timing
int num_runs = 100;
int num_runs = loops;
checkHipErrors(hipEventRecord(start));
for (int i = 0; i < num_runs; ++i) {
k.func(M, K, alpha, A, lda, x, beta, y);
......
#include "gemv_bf16.h"
int main(int argc, char **argv) {
int warmups = 100;
int loops = 2000;
bool do_verify = false;
float alpha = 1.0f;
float beta = 0.0f;
......@@ -10,6 +12,14 @@ int main(int argc, char **argv) {
int lda = K;
int block_size = 256;
if (char *value = getCmdOption(argv, argv + argc, "--warmups")) {
warmups = std::stoi(value);
}
if (char *value = getCmdOption(argv, argv + argc, "--loops")) {
loops = std::stoi(value);
}
if (char *value = getCmdOption(argv, argv + argc, "--verify")) {
do_verify = std::stoi(value) == 1;
}
......@@ -244,7 +254,8 @@ int main(int argc, char **argv) {
}});
// 运行所有测试
run_benchmark(kernels, M, K, alpha, d_A, lda, d_x, beta, d_y, do_verify);
run_benchmark(warmups, loops, kernels, M, K, alpha, d_A, lda, d_x, beta, d_y,
do_verify);
// 清理
checkHipErrors(hipFree(d_A));
......
......@@ -4,23 +4,24 @@ set -e
export HIP_VISIBLE_DEVICES=1
BIND_CMD="numactl -N 0 -m 0"
CXX=hipcc make
make clean
CXX=hipcc make GPU_ARCH=gfx936
# CXX=nvcc make GPU_ARCH=sm_80
if [[ "$*" == *"--pmc"* ]]; then
PROF_CMD="hipprof --trace-off --pmc --pmc-type 3"
${PROF_CMD} -o log/pmc-k1 ${BIND_CMD} ./gemv_bench --verify 1 -M 11264 -K 4096
${PROF_CMD} -o log/pmc-k2 ${BIND_CMD} ./gemv_bench --verify 1 -M 4096 -K 11264
${PROF_CMD} -o log/pmc-k3 ${BIND_CMD} ./gemv_bench --verify 1 -M 12288 -K 4096
${PROF_CMD} -o log/pmc-k4 ${BIND_CMD} ./gemv_bench --verify 1 -M 4096 -K 4096
PROF_CMD="hipprof --trace-off --pmc"
${PROF_CMD} -o log/pmc-w1 ${BIND_CMD} ./gemv_bench --warmups 10 --loops 20 --verify 1 -M 11264 -K 4096
${PROF_CMD} -o log/pmc-w2 ${BIND_CMD} ./gemv_bench --warmups 10 --loops 20 --verify 1 -M 4096 -K 11264
${PROF_CMD} -o log/pmc-w3 ${BIND_CMD} ./gemv_bench --warmups 10 --loops 20 --verify 1 -M 12288 -K 4096
${PROF_CMD} -o log/pmc-w4 ${BIND_CMD} ./gemv_bench --warmups 10 --loops 20 --verify 1 -M 4096 -K 4096
elif [[ "$*" == *"--trace"* ]]; then
PROF_CMD="hipprof --hip-trace"
${PROF_CMD} -o log/trace-k1 ${BIND_CMD} ./gemv_bench --verify 1 -M 11264 -K 4096
${PROF_CMD} -o log/trace-k2 ${BIND_CMD} ./gemv_bench --verify 1 -M 4096 -K 11264
${PROF_CMD} -o log/trace-k3 ${BIND_CMD} ./gemv_bench --verify 1 -M 12288 -K 4096
${PROF_CMD} -o log/trace-k4 ${BIND_CMD} ./gemv_bench --verify 1 -M 4096 -K 4096
${PROF_CMD} -o log/trace-w1 ${BIND_CMD} ./gemv_bench --warmups 100 --loops 1000 --verify 1 -M 11264 -K 4096
${PROF_CMD} -o log/trace-w2 ${BIND_CMD} ./gemv_bench --warmups 100 --loops 1000 --verify 1 -M 4096 -K 11264
${PROF_CMD} -o log/trace-w3 ${BIND_CMD} ./gemv_bench --warmups 100 --loops 1000 --verify 1 -M 12288 -K 4096
${PROF_CMD} -o log/trace-w4 ${BIND_CMD} ./gemv_bench --warmups 100 --loops 1000 --verify 1 -M 4096 -K 4096
else
${BIND_CMD} ./gemv_bench --verify 1 -M 11264 -K 4096
${BIND_CMD} ./gemv_bench --verify 1 -M 4096 -K 11264
${BIND_CMD} ./gemv_bench --verify 1 -M 12288 -K 4096
${BIND_CMD} ./gemv_bench --verify 1 -M 4096 -K 4096
${BIND_CMD} ./gemv_bench --warmups 100 --loops 2000 --verify 1 -M 11264 -K 4096
${BIND_CMD} ./gemv_bench --warmups 100 --loops 2000 --verify 1 -M 4096 -K 11264
${BIND_CMD} ./gemv_bench --warmups 100 --loops 2000 --verify 1 -M 12288 -K 4096
${BIND_CMD} ./gemv_bench --warmups 100 --loops 2000 --verify 1 -M 4096 -K 4096
fi
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment