Commit 845b2d24 authored by one's avatar one
Browse files

Update GEMV benchmarks and evo2 scripts

parent 8ac49790
...@@ -123,10 +123,12 @@ def main(): ...@@ -123,10 +123,12 @@ def main():
# Read and process sequences # Read and process sequences
sequences = read_prompts('prompts.csv') sequences = read_prompts('prompts.csv')
# DEBUG: replace all prompts with the longest prompt to enable uniform lengths # Debugging: replace all prompts with the longest prompt
if args.batch_size > 1:
longest_prompt = max(sequences, key=len) longest_prompt = max(sequences, key=len)
sequences = [longest_prompt] * len(sequences) sequences = [longest_prompt] * len(sequences)
print(f"[debug] Using longest prompt len={len(longest_prompt)} for all sequences") print(f"[debug] Using longest prompt len={len(longest_prompt)} for all sequences")
scores = generate_and_score( scores = generate_and_score(
sequences=sequences, sequences=sequences,
model=model, model=model,
......
...@@ -4,31 +4,45 @@ ...@@ -4,31 +4,45 @@
// Warp Size 根据架构自动选择 // Warp Size 根据架构自动选择
#if defined(__HIP_PLATFORM_AMD__) #if defined(__HIP_PLATFORM_AMD__)
#define WARP_SIZE 64 // DCU #define WARP_SIZE 64 // Hygon/AMD
#else #else
#define WARP_SIZE 32 // NVIDIA #define WARP_SIZE 32 // Nvidia
#endif #endif
#define VEC_WIDTH 8 #define VEC_WIDTH 8
#define OFFSET(i, j, lda) ((i) + (j) * (lda)) #define OFFSET(i, j, lda) ((i) + (j) * (lda))
#define OFFSET_T(i, j, lda) ((i) * (lda) + (j)) #define OFFSET_T(i, j, lda) ((i) * (lda) + (j))
/**
* 平台相关的 Shared Memory / LDS
*/
#if defined(__HIP_PLATFORM_AMD__)
// Hygon/AMD: 64KB LDS per CU
constexpr int MAX_SHMEM_BYTES_PER_BLOCK = 65536;
#else
// Nvidia: 48KB
constexpr int MAX_SHMEM_BYTES_PER_BLOCK = 49152;
#endif
/** /**
* 根据需求的并发 block 数量计算 shmem 用量(即 TILE_K 指定的 BF16 元素个数) * 根据需求的并发 block 数量计算 shmem 用量(即 TILE_K 指定的 BF16 元素个数)
* *
* AlignElements 为对齐粒度,即元素个数,默认 128-bit 对齐。 * AlignElements 为对齐粒度,即元素个数,默认 128-bit 对齐。
* - 8: 对齐到 128-bit (可能有利于 load128b) * - 8: 对齐到 128-bit (可能有利于 load128b)
* - 16: 对齐到 256-bit (某些 MFMA 指令需求) * - 16: 对齐到 256-bit (某些 MFMA 指令需求)
*
* concurrent_blocks: 期望的并发 block 数(用于计算可用 shmem)
* - Hygon/AMD: 表示每个 CU 上的并发 block 数
* - Nvidia: 设置为 1 即可(每个 block 独立使用 shmem)
*/ */
template <int AlignElements = 8> template <int AlignElements = 8>
constexpr int calculate_tile_k(int concurrent_blocks) { constexpr int calculate_tile_k(int concurrent_blocks = 1) {
// 安全检查 // 安全检查
if (concurrent_blocks < 1) if (concurrent_blocks < 1)
concurrent_blocks = 1; concurrent_blocks = 1;
// 直接切分 LDS // 计算每个 block 可用的 shmem
constexpr int MAX_LDS_BYTES_PER_CU = 65536; int bytes_per_block = MAX_SHMEM_BYTES_PER_BLOCK / concurrent_blocks;
int bytes_per_block = MAX_LDS_BYTES_PER_CU / concurrent_blocks;
// 转为元素个数 // 转为元素个数
int max_elements = bytes_per_block / sizeof(hip_bfloat16); int max_elements = bytes_per_block / sizeof(hip_bfloat16);
...@@ -50,14 +64,19 @@ typedef float __attribute__((ext_vector_type(4))) float4_native; ...@@ -50,14 +64,19 @@ typedef float __attribute__((ext_vector_type(4))) float4_native;
/// 128-bit non-temporal load 或者 cached load /// 128-bit non-temporal load 或者 cached load
template <bool USE_NTL = false> template <bool USE_NTL = false>
__device__ __forceinline__ bf16_x8 load_128b(const hip_bfloat16 *src) { __device__ __forceinline__ bf16_x8 load_128b(const hip_bfloat16 *src) {
if constexpr (USE_NTL) {
#if defined(__NVCC__) || defined(__CUDACC__) #if defined(__NVCC__) || defined(__CUDACC__)
// NVIDIA 平台:直接使用普通加载 // Nvidia 平台:PTX 内联汇编实现 cache streaming (ld.global.cs)
// NVCC 的优化器通常会自动选择合适的加载指令(如 LDG) uint4 tmp; // 128-bit = 4 x 32-bit
// 如果需要显式控制,可以使用 __ldg() 或 PTX 内联汇编
return *reinterpret_cast<const bf16_x8 *>(src); asm volatile("ld.global.cs.v4.u32 {%0, %1, %2, %3}, [%4];"
: "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w)
: "l"(src)
: "memory");
return *reinterpret_cast<bf16_x8 *>(&tmp);
#else #else
if constexpr (USE_NTL) { // Hygon/AMD 平台:使用 Clang 内置 non-temporal load 函数
// DCU:使用 Clang 内置 non-temporal load 函数
// 把地址转换为 float4_native 指针 // 把地址转换为 float4_native 指针
const float4_native *ptr = reinterpret_cast<const float4_native *>(src); const float4_native *ptr = reinterpret_cast<const float4_native *>(src);
...@@ -66,10 +85,10 @@ __device__ __forceinline__ bf16_x8 load_128b(const hip_bfloat16 *src) { ...@@ -66,10 +85,10 @@ __device__ __forceinline__ bf16_x8 load_128b(const hip_bfloat16 *src) {
// 把加载到的 128 位数据重新解释为 bf16_x8 // 把加载到的 128 位数据重新解释为 bf16_x8
return *reinterpret_cast<bf16_x8 *>(&tmp); return *reinterpret_cast<bf16_x8 *>(&tmp);
#endif
} else { } else {
return *reinterpret_cast<const bf16_x8 *>(src); return *reinterpret_cast<const bf16_x8 *>(src);
} }
#endif
} }
/** y = alpha * A^T * x + 0 * y /** y = alpha * A^T * x + 0 * y
......
...@@ -70,8 +70,12 @@ int main(int argc, char **argv) { ...@@ -70,8 +70,12 @@ int main(int argc, char **argv) {
constexpr bool NTL = true; constexpr bool NTL = true;
constexpr int UNROLL = 4; constexpr int UNROLL = 4;
constexpr int TILE_K = calculate_tile_k<8>(4);
constexpr int ROWS_PER_WARP = 2; constexpr int ROWS_PER_WARP = 2;
#if defined(__HIP_PLATFORM_AMD__)
constexpr int TILE_K = calculate_tile_k<8>(4);
#else
constexpr int TILE_K = calculate_tile_k<8>(1);
#endif
kernels.push_back( kernels.push_back(
{"naive", [&](int M, int K, float alpha, const hip_bfloat16 *A, int lda, {"naive", [&](int M, int K, float alpha, const hip_bfloat16 *A, int lda,
......
...@@ -5,6 +5,7 @@ export HIP_VISIBLE_DEVICES=1 ...@@ -5,6 +5,7 @@ export HIP_VISIBLE_DEVICES=1
BIND_CMD="numactl -N 0 -m 0" BIND_CMD="numactl -N 0 -m 0"
CXX=hipcc make CXX=hipcc make
# CXX=nvcc make GPU_ARCH=sm_80
if [[ "$*" == *"--pmc"* ]]; then if [[ "$*" == *"--pmc"* ]]; then
PROF_CMD="hipprof --trace-off --pmc --pmc-type 3" PROF_CMD="hipprof --trace-off --pmc --pmc-type 3"
${PROF_CMD} -o log/pmc-k1 ${BIND_CMD} ./gemv_bench --verify 1 -M 11264 -K 4096 ${PROF_CMD} -o log/pmc-k1 ${BIND_CMD} ./gemv_bench --verify 1 -M 11264 -K 4096
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment