Update GEMV benchmarks and evo2 scripts

845b2d24 · one · 8ac49790 · 845b2d24 · 845b2d24 · 845b2d24
Commit 845b2d24 authored Jan 29, 2026 by one
Showing with 44 additions and 18 deletions

evo2/test_evo2_generation_batched.py evo2/test_evo2_generation_batched.py +6 -4

gemv/gemv_bf16.h gemv/gemv_bf16.h +32 -13

gemv/main.cpp gemv/main.cpp +5 -1

gemv/run-all.sh gemv/run-all.sh +1 -0

No files found.
--- a/evo2/test_evo2_generation_batched.py
+++ b/evo2/test_evo2_generation_batched.py
@@ -123,10 +123,12 @@ def main():
    
    # Read and process sequences
    sequences = read_prompts('prompts.csv')
-    # DEBUG: replace all prompts with the longest prompt to enable uniform lengths
+    # Debugging: replace all prompts with the longest prompt
+    if args.batch_size > 1:
        longest_prompt = max(sequences, key=len)
        sequences = [longest_prompt] * len(sequences)
        print(f"[debug] Using longest prompt len={len(longest_prompt)} for all sequences")
+
    scores = generate_and_score(
        sequences=sequences,
        model=model,

--- a/gemv/gemv_bf16.h
+++ b/gemv/gemv_bf16.h
@@ -4,31 +4,45 @@

 // Warp Size 根据架构自动选择
 #if defined(__HIP_PLATFORM_AMD__)
-#define WARP_SIZE 64 // DCU
+#define WARP_SIZE 64 // Hygon/AMD
 #else
-#define WARP_SIZE 32 // NVIDIA
+#define WARP_SIZE 32 // Nvidia
 #endif

 #define VEC_WIDTH 8
 #define OFFSET(i, j, lda) ((i) + (j) * (lda))
 #define OFFSET_T(i, j, lda) ((i) * (lda) + (j))

+/**
+ * 平台相关的 Shared Memory / LDS
+ */
+#if defined(__HIP_PLATFORM_AMD__)
+  // Hygon/AMD: 64KB LDS per CU
+  constexpr int MAX_SHMEM_BYTES_PER_BLOCK = 65536;
+#else
+  // Nvidia: 48KB
+  constexpr int MAX_SHMEM_BYTES_PER_BLOCK = 49152;
+#endif
+
 /**
 * 根据需求的并发 block 数量计算 shmem 用量（即 TILE_K 指定的 BF16 元素个数）
 *
 * AlignElements 为对齐粒度，即元素个数，默认 128-bit 对齐。
 * - 8:  对齐到 128-bit (可能有利于 load128b)
 * - 16: 对齐到 256-bit (某些 MFMA 指令需求)
+ * 
+ * concurrent_blocks: 期望的并发 block 数（用于计算可用 shmem）
+ * - Hygon/AMD: 表示每个 CU 上的并发 block 数
+ * - Nvidia: 设置为 1 即可（每个 block 独立使用 shmem）
 */
 template <int AlignElements = 8>
-constexpr int calculate_tile_k(int concurrent_blocks) {
+constexpr int calculate_tile_k(int concurrent_blocks = 1) {
  // 安全检查
  if (concurrent_blocks < 1)
    concurrent_blocks = 1;

-  // 直接切分 LDS
-  constexpr int MAX_LDS_BYTES_PER_CU = 65536;
-  int bytes_per_block = MAX_LDS_BYTES_PER_CU / concurrent_blocks;
+  // 计算每个 block 可用的 shmem
+  int bytes_per_block = MAX_SHMEM_BYTES_PER_BLOCK / concurrent_blocks;

  // 转为元素个数
  int max_elements = bytes_per_block / sizeof(hip_bfloat16);
@@ -50,14 +64,19 @@ typedef float __attribute__((ext_vector_type(4))) float4_native;
 /// 128-bit non-temporal load 或者 cached load
 template <bool USE_NTL = false>
 __device__ __forceinline__ bf16_x8 load_128b(const hip_bfloat16 *src) {
+  if constexpr (USE_NTL) {
 #if defined(__NVCC__) || defined(__CUDACC__)
-  // NVIDIA 平台：直接使用普通加载
-  // NVCC 的优化器通常会自动选择合适的加载指令（如 LDG）
-  // 如果需要显式控制，可以使用 __ldg() 或 PTX 内联汇编
-  return *reinterpret_cast<const bf16_x8 *>(src);
+    // Nvidia 平台：PTX 内联汇编实现 cache streaming (ld.global.cs)
+    uint4 tmp; // 128-bit = 4 x 32-bit
+
+    asm volatile("ld.global.cs.v4.u32 {%0, %1, %2, %3}, [%4];"
+                 : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w)
+                 : "l"(src)
+                 : "memory");
+
+    return *reinterpret_cast<bf16_x8 *>(&tmp);
 #else
-  if constexpr (USE_NTL) {
-    // DCU：使用 Clang 内置 non-temporal load 函数
+    // Hygon/AMD 平台：使用 Clang 内置 non-temporal load 函数
    // 把地址转换为 float4_native 指针
    const float4_native *ptr = reinterpret_cast<const float4_native *>(src);

@@ -66,10 +85,10 @@ __device__ __forceinline__ bf16_x8 load_128b(const hip_bfloat16 *src) {

    // 把加载到的 128 位数据重新解释为 bf16_x8
    return *reinterpret_cast<bf16_x8 *>(&tmp);
+#endif
  } else {
    return *reinterpret_cast<const bf16_x8 *>(src);
  }
-#endif
 }

 /** y = alpha * A^T * x + 0 * y

--- a/gemv/main.cpp
+++ b/gemv/main.cpp
@@ -70,8 +70,12 @@ int main(int argc, char **argv) {

  constexpr bool NTL = true;
  constexpr int UNROLL = 4;
-  constexpr int TILE_K = calculate_tile_k<8>(4);
  constexpr int ROWS_PER_WARP = 2;
+#if defined(__HIP_PLATFORM_AMD__)
+  constexpr int TILE_K = calculate_tile_k<8>(4);
+#else
+  constexpr int TILE_K = calculate_tile_k<8>(1);
+#endif

  kernels.push_back(
      {"naive", [&](int M, int K, float alpha, const hip_bfloat16 *A, int lda,

--- a/gemv/run-all.sh
+++ b/gemv/run-all.sh
@@ -5,6 +5,7 @@ export HIP_VISIBLE_DEVICES=1
 BIND_CMD="numactl -N 0 -m 0"

 CXX=hipcc make
+# CXX=nvcc make GPU_ARCH=sm_80
 if [[ "$*" == *"--pmc"* ]]; then
    PROF_CMD="hipprof --trace-off --pmc --pmc-type 3"
    ${PROF_CMD} -o log/pmc-k1 ${BIND_CMD} ./gemv_bench --verify 1 -M 11264 -K 4096