Commit a781cad3 authored by one
Browse files

Add warmups and loops params to GEMV benchmarks

parent 845b2d24
GPU_ARCH ?= gfx936
CXX ?= hipcc CXX ?= hipcc
CXX_FLAGS ?= -std=c++17 -O3 CXX_FLAGS ?= -std=c++17 -O3
GPU_ARCH ?= gfx936 HIPCC_FLAGS = --offload-arch=$(GPU_ARCH)
NVCC_FLAGS = -arch=$(GPU_ARCH) -x cu
TARGET := gemv_bench TARGET := gemv_bench
SRC := main.cpp SRC := main.cpp
...@@ -11,15 +13,14 @@ DEP := gemv_bf16.h gemv_utils.h hip_compat.h ...@@ -11,15 +13,14 @@ DEP := gemv_bf16.h gemv_utils.h hip_compat.h
all: $(TARGET) all: $(TARGET)
# 根据 CXX 变量判断编译器类型 # 根据 CXX 变量判断编译器类型
ifneq (,$(findstring nvcc,$(CXX))) ifneq (,$(findstring hipcc,$(CXX)))
# HIPCC 编译
# NVCC 编译
$(TARGET): $(SRC) $(DEP) $(TARGET): $(SRC) $(DEP)
$(CXX) $(CXX_FLAGS) -arch=$(GPU_ARCH) -x cu $< -o $@ $(CXX) $(CXX_FLAGS) $(HIPCC_FLAGS) $< -o $@
else else
# HIPCC 编译 # NVCC 编译
$(TARGET): $(SRC) $(DEP) $(TARGET): $(SRC) $(DEP)
$(CXX) $(CXX_FLAGS) --offload-arch=$(GPU_ARCH) $< -o $@ $(CXX) $(CXX_FLAGS) $(NVCC_FLAGS) $< -o $@
endif endif
clean: clean:
......
...@@ -106,7 +106,8 @@ struct KernelCase { ...@@ -106,7 +106,8 @@ struct KernelCase {
KernelLauncher func; KernelLauncher func;
}; };
inline void run_benchmark(const std::vector<KernelCase> &cases, int M, int K, inline void run_benchmark(int warmups, int loops,
const std::vector<KernelCase> &cases, int M, int K,
float alpha, const hip_bfloat16 *A, int lda, float alpha, const hip_bfloat16 *A, int lda,
const hip_bfloat16 *x, float beta, hip_bfloat16 *y, const hip_bfloat16 *x, float beta, hip_bfloat16 *y,
bool do_verify) { bool do_verify) {
...@@ -144,6 +145,7 @@ inline void run_benchmark(const std::vector<KernelCase> &cases, int M, int K, ...@@ -144,6 +145,7 @@ inline void run_benchmark(const std::vector<KernelCase> &cases, int M, int K,
printf("lda=%d\n", lda); printf("lda=%d\n", lda);
printf("sizeof(A)=%lu MB\n", M * lda * sizeof(hip_bfloat16) / 1024 / 1024); printf("sizeof(A)=%lu MB\n", M * lda * sizeof(hip_bfloat16) / 1024 / 1024);
printf("L2 cache=%d MB\n", get_l2_cache_size()); printf("L2 cache=%d MB\n", get_l2_cache_size());
printf("Warmups=%d, Loops=%d\n", warmups, loops);
printf("%s\n", std::string(w_table, '-').c_str()); printf("%s\n", std::string(w_table, '-').c_str());
printf("%-38s %10s %10s %10s %8s\n", "Kernel Name", "Time (us)", "GFLOPS", printf("%-38s %10s %10s %10s %8s\n", "Kernel Name", "Time (us)", "GFLOPS",
"BW (GB/s)", "Result"); "BW (GB/s)", "Result");
...@@ -172,13 +174,13 @@ inline void run_benchmark(const std::vector<KernelCase> &cases, int M, int K, ...@@ -172,13 +174,13 @@ inline void run_benchmark(const std::vector<KernelCase> &cases, int M, int K,
} }
// 2. Warmup // 2. Warmup
for (int i = 0; i < 100; ++i) { for (int i = 0; i < warmups; ++i) {
k.func(M, K, alpha, A, lda, x, beta, y); k.func(M, K, alpha, A, lda, x, beta, y);
} }
checkHipErrors(hipDeviceSynchronize()); checkHipErrors(hipDeviceSynchronize());
// 3. Timing // 3. Timing
int num_runs = 100; int num_runs = loops;
checkHipErrors(hipEventRecord(start)); checkHipErrors(hipEventRecord(start));
for (int i = 0; i < num_runs; ++i) { for (int i = 0; i < num_runs; ++i) {
k.func(M, K, alpha, A, lda, x, beta, y); k.func(M, K, alpha, A, lda, x, beta, y);
......
#include "gemv_bf16.h" #include "gemv_bf16.h"
int main(int argc, char **argv) { int main(int argc, char **argv) {
int warmups = 100;
int loops = 2000;
bool do_verify = false; bool do_verify = false;
float alpha = 1.0f; float alpha = 1.0f;
float beta = 0.0f; float beta = 0.0f;
...@@ -10,6 +12,14 @@ int main(int argc, char **argv) { ...@@ -10,6 +12,14 @@ int main(int argc, char **argv) {
int lda = K; int lda = K;
int block_size = 256; int block_size = 256;
if (char *value = getCmdOption(argv, argv + argc, "--warmups")) {
warmups = std::stoi(value);
}
if (char *value = getCmdOption(argv, argv + argc, "--loops")) {
loops = std::stoi(value);
}
if (char *value = getCmdOption(argv, argv + argc, "--verify")) { if (char *value = getCmdOption(argv, argv + argc, "--verify")) {
do_verify = std::stoi(value) == 1; do_verify = std::stoi(value) == 1;
} }
...@@ -244,7 +254,8 @@ int main(int argc, char **argv) { ...@@ -244,7 +254,8 @@ int main(int argc, char **argv) {
}}); }});
// 运行所有测试 // 运行所有测试
run_benchmark(kernels, M, K, alpha, d_A, lda, d_x, beta, d_y, do_verify); run_benchmark(warmups, loops, kernels, M, K, alpha, d_A, lda, d_x, beta, d_y,
do_verify);
// 清理 // 清理
checkHipErrors(hipFree(d_A)); checkHipErrors(hipFree(d_A));
......
...@@ -4,23 +4,24 @@ set -e ...@@ -4,23 +4,24 @@ set -e
export HIP_VISIBLE_DEVICES=1 export HIP_VISIBLE_DEVICES=1
BIND_CMD="numactl -N 0 -m 0" BIND_CMD="numactl -N 0 -m 0"
CXX=hipcc make make clean
CXX=hipcc make GPU_ARCH=gfx936
# CXX=nvcc make GPU_ARCH=sm_80 # CXX=nvcc make GPU_ARCH=sm_80
if [[ "$*" == *"--pmc"* ]]; then if [[ "$*" == *"--pmc"* ]]; then
PROF_CMD="hipprof --trace-off --pmc --pmc-type 3" PROF_CMD="hipprof --trace-off --pmc"
${PROF_CMD} -o log/pmc-k1 ${BIND_CMD} ./gemv_bench --verify 1 -M 11264 -K 4096 ${PROF_CMD} -o log/pmc-w1 ${BIND_CMD} ./gemv_bench --warmups 10 --loops 20 --verify 1 -M 11264 -K 4096
${PROF_CMD} -o log/pmc-k2 ${BIND_CMD} ./gemv_bench --verify 1 -M 4096 -K 11264 ${PROF_CMD} -o log/pmc-w2 ${BIND_CMD} ./gemv_bench --warmups 10 --loops 20 --verify 1 -M 4096 -K 11264
${PROF_CMD} -o log/pmc-k3 ${BIND_CMD} ./gemv_bench --verify 1 -M 12288 -K 4096 ${PROF_CMD} -o log/pmc-w3 ${BIND_CMD} ./gemv_bench --warmups 10 --loops 20 --verify 1 -M 12288 -K 4096
${PROF_CMD} -o log/pmc-k4 ${BIND_CMD} ./gemv_bench --verify 1 -M 4096 -K 4096 ${PROF_CMD} -o log/pmc-w4 ${BIND_CMD} ./gemv_bench --warmups 10 --loops 20 --verify 1 -M 4096 -K 4096
elif [[ "$*" == *"--trace"* ]]; then elif [[ "$*" == *"--trace"* ]]; then
PROF_CMD="hipprof --hip-trace" PROF_CMD="hipprof --hip-trace"
${PROF_CMD} -o log/trace-k1 ${BIND_CMD} ./gemv_bench --verify 1 -M 11264 -K 4096 ${PROF_CMD} -o log/trace-w1 ${BIND_CMD} ./gemv_bench --warmups 100 --loops 1000 --verify 1 -M 11264 -K 4096
${PROF_CMD} -o log/trace-k2 ${BIND_CMD} ./gemv_bench --verify 1 -M 4096 -K 11264 ${PROF_CMD} -o log/trace-w2 ${BIND_CMD} ./gemv_bench --warmups 100 --loops 1000 --verify 1 -M 4096 -K 11264
${PROF_CMD} -o log/trace-k3 ${BIND_CMD} ./gemv_bench --verify 1 -M 12288 -K 4096 ${PROF_CMD} -o log/trace-w3 ${BIND_CMD} ./gemv_bench --warmups 100 --loops 1000 --verify 1 -M 12288 -K 4096
${PROF_CMD} -o log/trace-k4 ${BIND_CMD} ./gemv_bench --verify 1 -M 4096 -K 4096 ${PROF_CMD} -o log/trace-w4 ${BIND_CMD} ./gemv_bench --warmups 100 --loops 1000 --verify 1 -M 4096 -K 4096
else else
${BIND_CMD} ./gemv_bench --verify 1 -M 11264 -K 4096 ${BIND_CMD} ./gemv_bench --warmups 100 --loops 2000 --verify 1 -M 11264 -K 4096
${BIND_CMD} ./gemv_bench --verify 1 -M 4096 -K 11264 ${BIND_CMD} ./gemv_bench --warmups 100 --loops 2000 --verify 1 -M 4096 -K 11264
${BIND_CMD} ./gemv_bench --verify 1 -M 12288 -K 4096 ${BIND_CMD} ./gemv_bench --warmups 100 --loops 2000 --verify 1 -M 12288 -K 4096
${BIND_CMD} ./gemv_bench --verify 1 -M 4096 -K 4096 ${BIND_CMD} ./gemv_bench --warmups 100 --loops 2000 --verify 1 -M 4096 -K 4096
fi fi
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment