Commit b3a56179 authored by one
Browse files

Update GEMV benchmarks

parent 977247a7
...@@ -3,7 +3,7 @@ CXXFLAGS ?= -std=c++17 -O3 ...@@ -3,7 +3,7 @@ CXXFLAGS ?= -std=c++17 -O3
OFFLOAD_ARCH ?= gfx936 OFFLOAD_ARCH ?= gfx936
TARGET := gemv_bench TARGET := gemv_bench
SRC := gemv_bf16.hip SRC := gemv_bf16.cpp
DEP := gemv_utils.h DEP := gemv_utils.h
.PHONY: all clean .PHONY: all clean
......
This diff is collapsed.
This diff is collapsed.
...@@ -37,7 +37,8 @@ inline char *getCmdOption(char **begin, char **end, const std::string &option) { ...@@ -37,7 +37,8 @@ inline char *getCmdOption(char **begin, char **end, const std::string &option) {
// -------------------------------------------------------------------------------- // --------------------------------------------------------------------------------
inline void gemv_cpu(int M, int K, float alpha, const hip_bfloat16 *h_A, inline void gemv_cpu(int M, int K, float alpha, const hip_bfloat16 *h_A,
int lda, const hip_bfloat16 *h_x, hip_bfloat16 *h_y) { int lda, const hip_bfloat16 *h_x, float beta,
hip_bfloat16 *h_y) {
for (int m = 0; m < M; ++m) { for (int m = 0; m < M; ++m) {
float sum = 0.0f; float sum = 0.0f;
for (int k = 0; k < K; ++k) { for (int k = 0; k < K; ++k) {
...@@ -45,7 +46,7 @@ inline void gemv_cpu(int M, int K, float alpha, const hip_bfloat16 *h_A, ...@@ -45,7 +46,7 @@ inline void gemv_cpu(int M, int K, float alpha, const hip_bfloat16 *h_A,
float val_x = static_cast<float>(h_x[k]); float val_x = static_cast<float>(h_x[k]);
sum += val_a * val_x; sum += val_a * val_x;
} }
h_y[m] = hip_bfloat16(alpha * sum); h_y[m] = hip_bfloat16(alpha * sum + beta * h_y[m]);
} }
return; return;
...@@ -85,9 +86,9 @@ inline bool verify_result(int M, const hip_bfloat16 *h_y_gpu, ...@@ -85,9 +86,9 @@ inline bool verify_result(int M, const hip_bfloat16 *h_y_gpu,
// -------------------------------------------------------------------------------- // --------------------------------------------------------------------------------
// 定义统一的 Kernel Launcher 签名 // 定义统一的 Kernel Launcher 签名
using KernelLauncher = using KernelLauncher = std::function<void(
std::function<void(int M, int K, float alpha, const hip_bfloat16 *A, int M, int K, float alpha, const hip_bfloat16 *A, int lda,
int lda, const hip_bfloat16 *x, hip_bfloat16 *y)>; const hip_bfloat16 *x, float beta, hip_bfloat16 *y)>;
struct KernelCase { struct KernelCase {
std::string name; std::string name;
...@@ -96,7 +97,7 @@ struct KernelCase { ...@@ -96,7 +97,7 @@ struct KernelCase {
inline void run_benchmark(const std::vector<KernelCase> &cases, int M, int K, inline void run_benchmark(const std::vector<KernelCase> &cases, int M, int K,
float alpha, const hip_bfloat16 *A, int lda, float alpha, const hip_bfloat16 *A, int lda,
const hip_bfloat16 *x, hip_bfloat16 *y, const hip_bfloat16 *x, float beta, hip_bfloat16 *y,
bool do_verify) { bool do_verify) {
std::cout << "GEMV Benchmarks" << std::endl; std::cout << "GEMV Benchmarks" << std::endl;
...@@ -120,7 +121,7 @@ inline void run_benchmark(const std::vector<KernelCase> &cases, int M, int K, ...@@ -120,7 +121,7 @@ inline void run_benchmark(const std::vector<KernelCase> &cases, int M, int K,
hipMemcpyDeviceToHost)); hipMemcpyDeviceToHost));
// 计算 CPU Reference // 计算 CPU Reference
gemv_cpu(M, K, alpha, h_A.data(), lda, h_x.data(), h_y_ref.data()); gemv_cpu(M, K, alpha, h_A.data(), lda, h_x.data(), beta, h_y_ref.data());
} }
// 列宽 // 列宽
...@@ -143,7 +144,7 @@ inline void run_benchmark(const std::vector<KernelCase> &cases, int M, int K, ...@@ -143,7 +144,7 @@ inline void run_benchmark(const std::vector<KernelCase> &cases, int M, int K,
checkHipErrors(hipMemset(y, 0, M * sizeof(hip_bfloat16))); checkHipErrors(hipMemset(y, 0, M * sizeof(hip_bfloat16)));
// 运行一次 // 运行一次
k.func(M, K, alpha, A, lda, x, y); k.func(M, K, alpha, A, lda, x, beta, y);
checkHipErrors(hipDeviceSynchronize()); checkHipErrors(hipDeviceSynchronize());
// 拷回结果 // 拷回结果
...@@ -159,7 +160,7 @@ inline void run_benchmark(const std::vector<KernelCase> &cases, int M, int K, ...@@ -159,7 +160,7 @@ inline void run_benchmark(const std::vector<KernelCase> &cases, int M, int K,
// 2. Warmup // 2. Warmup
for (int i = 0; i < 100; ++i) { for (int i = 0; i < 100; ++i) {
k.func(M, K, alpha, A, lda, x, y); k.func(M, K, alpha, A, lda, x, beta, y);
} }
checkHipErrors(hipDeviceSynchronize()); checkHipErrors(hipDeviceSynchronize());
...@@ -167,7 +168,7 @@ inline void run_benchmark(const std::vector<KernelCase> &cases, int M, int K, ...@@ -167,7 +168,7 @@ inline void run_benchmark(const std::vector<KernelCase> &cases, int M, int K,
int num_runs = 1000; int num_runs = 1000;
checkHipErrors(hipEventRecord(start)); checkHipErrors(hipEventRecord(start));
for (int i = 0; i < num_runs; ++i) { for (int i = 0; i < num_runs; ++i) {
k.func(M, K, alpha, A, lda, x, y); k.func(M, K, alpha, A, lda, x, beta, y);
} }
checkHipErrors(hipEventRecord(stop)); checkHipErrors(hipEventRecord(stop));
checkHipErrors(hipEventSynchronize(stop)); checkHipErrors(hipEventSynchronize(stop));
...@@ -184,8 +185,8 @@ inline void run_benchmark(const std::vector<KernelCase> &cases, int M, int K, ...@@ -184,8 +185,8 @@ inline void run_benchmark(const std::vector<KernelCase> &cases, int M, int K,
double bytes_moved = (double)(M * K + K + M) * sizeof(hip_bfloat16); double bytes_moved = (double)(M * K + K + M) * sizeof(hip_bfloat16);
double bw = bytes_moved / (avg_ms * 1e-3) / 1e9; double bw = bytes_moved / (avg_ms * 1e-3) / 1e9;
printf("%-38s %10.1f %10.2f %10.2f %8s\n", k.name.c_str(), avg_ms * 1e3, gflops, printf("%-38s %10.1f %10.2f %10.2f %8s\n", k.name.c_str(), avg_ms * 1e3,
bw, result_status.c_str()); gflops, bw, result_status.c_str());
} }
std::cout << std::string(w_table, '-') << std::endl; std::cout << std::string(w_table, '-') << std::endl;
......
#!/bin/bash #!/bin/bash
# BW150
export HIP_VISIBLE_DEVICES=1
BIND_CMD="numactl -N 0 -m 0"
make make
# BW150 if [[ "$*" == *"--trace"* ]]; then
export HIP_VISIBLE_DEVICES=4 PROF_CMD="hipprof --trace-off --pmc"
${PROF_CMD} -o log/pmc-k1 ${BIND_CMD} ./gemv_bench --verify 1 -M 11264 -K 4096
hipprof numactl -N 0 -m 0 ./gemv_bench --verify 1 -M 11264 -K 4096 ${PROF_CMD} -o log/pmc-k2 ${BIND_CMD} ./gemv_bench --verify 1 -M 4096 -K 11264
hipprof numactl -N 0 -m 0 ./gemv_bench --verify 1 -M 4096 -K 11264 ${PROF_CMD} -o log/pmc-k3 ${BIND_CMD} ./gemv_bench --verify 1 -M 12288 -K 4096
hipprof numactl -N 0 -m 0 ./gemv_bench --verify 1 -M 12288 -K 4096 ${PROF_CMD} -o log/pmc-k4 ${BIND_CMD} ./gemv_bench --verify 1 -M 4096 -K 4096
hipprof numactl -N 0 -m 0 ./gemv_bench --verify 1 -M 4096 -K 4096 else
\ No newline at end of file ${BIND_CMD} ./gemv_bench --verify 1 -M 11264 -K 4096
${BIND_CMD} ./gemv_bench --verify 1 -M 4096 -K 11264
${BIND_CMD} ./gemv_bench --verify 1 -M 12288 -K 4096
${BIND_CMD} ./gemv_bench --verify 1 -M 4096 -K 4096
fi
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment