Add GEMV benchmarks

977247a7 · one · 6ee19e2f · 977247a7 · 977247a7 · 977247a7
Commit 977247a7 authored Jan 27, 2026 by one
Showing with 1141 additions and 0 deletions

evo2/bm/Makefile evo2/bm/Makefile +17 -0

evo2/bm/gemv_bf16.hip evo2/bm/gemv_bf16.hip +916 -0

evo2/bm/gemv_utils.h evo2/bm/gemv_utils.h +197 -0

evo2/bm/run-all.sh evo2/bm/run-all.sh +11 -0

No files found.
--- a/evo2/bm/Makefile
+++ b/evo2/bm/Makefile
+HIPCC ?= hipcc
+CXXFLAGS ?= -std=c++17 -O3
+OFFLOAD_ARCH ?= gfx936
+
+TARGET := gemv_bench
+SRC := gemv_bf16.hip
+DEP := gemv_utils.h
+
+.PHONY: all clean
+
+all: $(TARGET)
+
+$(TARGET): $(SRC) $(DEP)
+	$(HIPCC) $(CXXFLAGS) --offload-arch=$(OFFLOAD_ARCH) $< -o $@
+
+clean:
+	rm -f $(TARGET)
--- a/evo2/bm/gemv_bf16.hip
+++ b/evo2/bm/gemv_bf16.hip
--- a/evo2/bm/gemv_utils.h
+++ b/evo2/bm/gemv_utils.h
+#pragma once
+
+#include <algorithm>
+#include <functional>
+#include <hip/hip_bfloat16.h>
+#include <hip/hip_runtime.h>
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <vector>
+
+// --------------------------------------------------------------------------------
+// Error Handling
+// --------------------------------------------------------------------------------
+
+inline void checkHipErrors(hipError_t result) {
+  if (result != hipSuccess) {
+    std::cerr << "HIP Error: " << hipGetErrorString(result) << std::endl;
+    exit(1);
+  }
+}
+
+// --------------------------------------------------------------------------------
+// Command Line Parsing
+// --------------------------------------------------------------------------------
+
+inline char *getCmdOption(char **begin, char **end, const std::string &option) {
+  char **itr = std::find(begin, end, option);
+  if (itr != end && ++itr != end) {
+    return *itr;
+  }
+  return 0;
+}
+
+// --------------------------------------------------------------------------------
+// CPU Reference & Verification
+// --------------------------------------------------------------------------------
+
+inline void gemv_cpu(int M, int K, float alpha, const hip_bfloat16 *h_A,
+                     int lda, const hip_bfloat16 *h_x, hip_bfloat16 *h_y) {
+  for (int m = 0; m < M; ++m) {
+    float sum = 0.0f;
+    for (int k = 0; k < K; ++k) {
+      float val_a = static_cast<float>(h_A[m * lda + k]);
+      float val_x = static_cast<float>(h_x[k]);
+      sum += val_a * val_x;
+    }
+    h_y[m] = hip_bfloat16(alpha * sum);
+  }
+
+  return;
+}
+
+inline bool verify_result(int M, const hip_bfloat16 *h_y_gpu,
+                          const hip_bfloat16 *h_y_ref) {
+  float max_diff = 0.0f;
+  int err_count = 0;
+  for (int i = 0; i < M; ++i) {
+    float gpu_val = static_cast<float>(h_y_gpu[i]);
+    float cpu_val = static_cast<float>(h_y_ref[i]);
+    float diff = std::abs(gpu_val - cpu_val);
+
+    // bfloat16 的精度有限，容忍度需要稍大一点
+    // 同时也考虑数值大小，这里使用简单的绝对误差 + 相对误差阈值
+    if (diff > 0.1f && diff / (std::abs(cpu_val) + 1e-6) > 0.01) {
+      if (err_count < 5) {
+        std::cerr << "Mismatch at index " << i << ": GPU=" << gpu_val
+                  << ", CPU=" << cpu_val << ", Diff=" << diff << std::endl;
+      }
+      err_count++;
+    }
+    max_diff = std::max(max_diff, diff);
+  }
+
+  if (err_count > 0) {
+    std::cerr << "Total mismatches: " << err_count << ", Max Diff: " << max_diff
+              << std::endl;
+    return false;
+  }
+  return true;
+}
+
+// --------------------------------------------------------------------------------
+// Benchmark Framework
+// --------------------------------------------------------------------------------
+
+// 定义统一的 Kernel Launcher 签名
+using KernelLauncher =
+    std::function<void(int M, int K, float alpha, const hip_bfloat16 *A,
+                       int lda, const hip_bfloat16 *x, hip_bfloat16 *y)>;
+
+struct KernelCase {
+  std::string name;
+  KernelLauncher func;
+};
+
+inline void run_benchmark(const std::vector<KernelCase> &cases, int M, int K,
+                          float alpha, const hip_bfloat16 *A, int lda,
+                          const hip_bfloat16 *x, hip_bfloat16 *y,
+                          bool do_verify) {
+
+  std::cout << "GEMV Benchmarks" << std::endl;
+
+  hipEvent_t start, stop;
+  checkHipErrors(hipEventCreate(&start));
+  checkHipErrors(hipEventCreate(&stop));
+
+  // 准备 verification 数据
+  std::vector<hip_bfloat16> h_y_ref(M);
+  std::vector<hip_bfloat16> h_y_gpu(M);
+  std::vector<hip_bfloat16> h_A(M * K);
+  std::vector<hip_bfloat16> h_x(K);
+
+  if (do_verify) {
+    std::cout << "Verifying results against CPU reference..." << std::endl;
+    // 把 A 和 x 拷回 host 端计算
+    checkHipErrors(hipMemcpy(h_A.data(), A, M * K * sizeof(hip_bfloat16),
+                             hipMemcpyDeviceToHost));
+    checkHipErrors(hipMemcpy(h_x.data(), x, K * sizeof(hip_bfloat16),
+                             hipMemcpyDeviceToHost));
+
+    // 计算 CPU Reference
+    gemv_cpu(M, K, alpha, h_A.data(), lda, h_x.data(), h_y_ref.data());
+  }
+
+  // 列宽
+  const int w_table = 80;
+
+  // 表头
+  printf("%s\n", std::string(w_table, '-').c_str());
+  printf("M=%d, K=%d, N=1\n", M, K);
+  printf("lda=%d\n", lda);
+  printf("%s\n", std::string(w_table, '-').c_str());
+  printf("%-38s %10s %10s %10s %8s\n", "Kernel Name", "Time (us)", "GFLOPS",
+         "BW (GB/s)", "Result");
+
+  for (const auto &k : cases) {
+    std::string result_status = "Skipped";
+
+    // 1. Verification (如果启用)
+    if (do_verify) {
+      // 清零 d_y
+      checkHipErrors(hipMemset(y, 0, M * sizeof(hip_bfloat16)));
+
+      // 运行一次
+      k.func(M, K, alpha, A, lda, x, y);
+      checkHipErrors(hipDeviceSynchronize());
+
+      // 拷回结果
+      checkHipErrors(hipMemcpy(h_y_gpu.data(), y, M * sizeof(hip_bfloat16),
+                               hipMemcpyDeviceToHost));
+
+      if (verify_result(M, h_y_gpu.data(), h_y_ref.data())) {
+        result_status = "PASS";
+      } else {
+        result_status = "FAIL";
+      }
+    }
+
+    // 2. Warmup
+    for (int i = 0; i < 100; ++i) {
+      k.func(M, K, alpha, A, lda, x, y);
+    }
+    checkHipErrors(hipDeviceSynchronize());
+
+    // 3. Timing
+    int num_runs = 1000;
+    checkHipErrors(hipEventRecord(start));
+    for (int i = 0; i < num_runs; ++i) {
+      k.func(M, K, alpha, A, lda, x, y);
+    }
+    checkHipErrors(hipEventRecord(stop));
+    checkHipErrors(hipEventSynchronize(stop));
+
+    float total_ms = 0;
+    checkHipErrors(hipEventElapsedTime(&total_ms, start, stop));
+    float avg_ms = total_ms / num_runs;
+
+    // 4. Metrics
+    double gflops = (2.0 * M * K) / (avg_ms * 1e-3) / 1e9;
+
+    // Bandwidth = Read A + Read x + Write y
+    // A: M*K, x: K, y: M
+    double bytes_moved = (double)(M * K + K + M) * sizeof(hip_bfloat16);
+    double bw = bytes_moved / (avg_ms * 1e-3) / 1e9;
+
+    printf("%-38s %10.1f %10.2f %10.2f %8s\n", k.name.c_str(), avg_ms * 1e3, gflops,
+           bw, result_status.c_str());
+  }
+
+  std::cout << std::string(w_table, '-') << std::endl;
+
+  checkHipErrors(hipEventDestroy(start));
+  checkHipErrors(hipEventDestroy(stop));
+
+  return;
+}
--- a/evo2/bm/run-all.sh
+++ b/evo2/bm/run-all.sh
+#!/bin/bash
+
+make
+
+# BW150
+export HIP_VISIBLE_DEVICES=4
+
+hipprof numactl -N 0 -m 0 ./gemv_bench --verify 1 -M 11264 -K 4096
+hipprof numactl -N 0 -m 0 ./gemv_bench --verify 1 -M 4096 -K 11264
+hipprof numactl -N 0 -m 0 ./gemv_bench --verify 1 -M 12288 -K 4096
+hipprof numactl -N 0 -m 0 ./gemv_bench --verify 1 -M 4096 -K 4096
\ No newline at end of file