Commit 4a4927e5 authored by sunzhq2

init gemm-test

parent d51e625f
# GEMM-test
# Image usage
```
harbor.sourcefind.cn:5443/dcu/admin/base/vllm:0.11.0-ubuntu22.04-dtk26.04-0130-py3.10-20260204
```
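The container can be started in the usual way for DCU/ROCm images; the command below is only a sketch and depends on the host setup (it assumes the DCU devices are exposed through /dev/kfd and /dev/dri).
```
docker run -it --rm \
  --device=/dev/kfd --device=/dev/dri \
  --security-opt seccomp=unconfined \
  --shm-size=16G \
  harbor.sourcefind.cn:5443/dcu/admin/base/vllm:0.11.0-ubuntu22.04-dtk26.04-0130-py3.10-20260204 \
  /bin/bash
```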
# rocblas-bench
rocblas-bench is used to benchmark matrix multiplication. Before use, set up the executable with the following commands:
```
ln -s ${ROCM_PATH}/lib/rocblas/benchmark_tool/rocblas-bench ${ROCM_PATH}/bin/rocblas-bench
chmod +x ${ROCM_PATH}/bin/rocblas-bench
```
```
tar zxvf library_gpu5.tar.gz -C ./
bash run_rocblas_bench.sh
```
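A single GEMM case can also be run by hand. The sketch below only reuses flags that the bundled rocblas-bench script already passes (fp16 inputs with fp32 compute; the 4096x4096x4096 shape is just an example):
```
rocblas-bench -f gemm_ex \
    -m 4096 -n 4096 -k 4096 \
    --alpha 1.0 --beta 0.0 \
    --transposeA N --transposeB T \
    --a_type f16_r --lda 4096 \
    --b_type f16_r --ldb 4096 \
    --c_type f16_r --ldc 4096 \
    --d_type f16_r --ldd 4096 \
    --compute_type f32_r \
    --cold_iters 50 --iters 1000
```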
# hipblaslt
hipblaslt-bench is used to benchmark W8A8 GEMM. Before use, set up the executable with the following commands:
```
ln -s ${ROCM_PATH}/lib/hipblaslt/benchmark_tool/hipblaslt-bench ${ROCM_PATH}/bin/hipblaslt-bench
chmod +x ${ROCM_PATH}/bin/hipblaslt-bench
```
```
bash run_rocblas_bench.sh
```
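Similarly, a single W8A8 case can be launched directly; the sketch below mirrors the flags used by the bundled hipblaslt-bench script (int8 A/B, int32 compute, fp16 output; 4096x4096x4096 is just an example shape):
```
hipblaslt-bench --api_method c \
    -m 4096 -n 4096 -k 4096 \
    --alpha 1.0 --beta 0.0 \
    --transA T --transB N \
    --a_type i8_r --b_type i8_r \
    --c_type i32_r --d_type f16_r \
    --compute_type i32_r \
    --scale_type f32_r \
    --scaleA 2 --scaleB 2 \
    --cold_iters 50 --iters 1000
```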
# torch_gemm test
- run_gemm_benchmarks_torch.sh and gemm_benchmark.py implement the GEMM benchmark with torch (see the example invocation below)
- bash run_gemm_benchmarks_torch.sh
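
gemm_benchmark.py can also be invoked standalone for a single shape; every flag below is defined in its argument parser, and the values are only an example:
```
python3 gemm_benchmark.py --M 8192 --K 768 --N 8192 \
    --dtype float16 --warmup_iterations 50 --bench_iterations 1000 --transB
```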
import torch
import argparse
import time
import numpy as np
def parse_args():
    parser = argparse.ArgumentParser(description='gemm benchmark')
    parser.add_argument('--M', type=int, default=4096, help='M')
    parser.add_argument('--K', type=int, default=4096, help='K')
    parser.add_argument('--N', type=int, default=4096, help='N')
    parser.add_argument('--dtype', type=str, default='bfloat16',
                        choices=['float64', 'float32', 'float16', 'bfloat16', 'tf32',
                                 'int8', 'mixed_fp16_fp32', 'mixed_bf16_fp32',
                                 'mixed_int8_int32', 'mixed_tf32_fp32', 'w8a8'],
                        help='data type to benchmark')
    parser.add_argument('--alpha', type=float, default=1.0, help='alpha')
    parser.add_argument('--beta', type=float, default=0.0, help='beta')
    parser.add_argument('--warmup_iterations', type=int, default=50, help='number of warmup iterations')
    parser.add_argument('--bench_iterations', type=int, default=1000, help='number of benchmark iterations')
    parser.add_argument('--transA', action='store_true', default=False,
                        help='transpose matrix A')
    parser.add_argument('--transB', action='store_true', default=False,
                        help='transpose matrix B')
    return parser.parse_args()

def get_matrix(dims, dtype, device='cuda'):
    """Create a matrix of the given dtype."""
    if dtype in [torch.float64, torch.float32, torch.float16, torch.bfloat16]:
        return torch.randn(dims, dtype=dtype, device=device)
    elif dtype == torch.int8:
        return torch.randint(-128, 127, dims, dtype=torch.int8, device=device)
    else:
        return torch.randn(dims, dtype=torch.float32, device=device)


def get_blas_op(alpha, beta, transA=False, transB=False):
    """Return a BLAS-style GEMM operator."""
    def blas_op(a, b, c):
        # torch.addmm has no transpose flags, so transpose the inputs by hand
        a_op = a.t() if transA else a
        b_op = b.t() if transB else b
        return torch.addmm(c, a_op, b_op, beta=beta, alpha=alpha)
    return blas_op

def benchmark_gemm(args, dtype_config):
    """Run the GEMM benchmark."""
    M, K, N = args.M, args.K, args.N
    alpha, beta = args.alpha, args.beta
    transA, transB = args.transA, args.transB
    # Determine actual storage shapes from the transpose flags
    a_rows, a_cols = (K, M) if transA else (M, K)
    b_rows, b_cols = (N, K) if transB else (K, N)
    # Set up inputs and the matmul operator for the requested dtype configuration
    if dtype_config == 'mixed_fp16_fp32':
        # A,B: fp16, C: fp32 - addmm does not support this combination, implement manually
        a = torch.randn((a_rows, a_cols), dtype=torch.float16, device='cuda')
        b = torch.randn((b_rows, b_cols), dtype=torch.float16, device='cuda')
        c = torch.zeros((M, N), dtype=torch.float32, device='cuda')
        def matmul_op(a, b, c):
            a_op = a.t() if transA else a
            b_op = b.t() if transB else b
            result = torch.mm(a_op, b_op)  # fp16 result; copy_ below casts it into the fp32 C
            if alpha != 1.0 or beta != 0.0:
                result = alpha * result + beta * c
            c.copy_(result)
            return c
    elif dtype_config == 'mixed_bf16_fp32':
        # A,B: bf16, C: fp32 - addmm does not support this combination, implement manually
        a = torch.randn((a_rows, a_cols), dtype=torch.bfloat16, device='cuda')
        b = torch.randn((b_rows, b_cols), dtype=torch.bfloat16, device='cuda')
        c = torch.zeros((M, N), dtype=torch.float32, device='cuda')
        def matmul_op(a, b, c):
            a_op = a.t() if transA else a
            b_op = b.t() if transB else b
            result = torch.mm(a_op, b_op)  # bf16 result; copy_ below casts it into the fp32 C
            if alpha != 1.0 or beta != 0.0:
                result = alpha * result + beta * c
            c.copy_(result)
            return c
    elif dtype_config == 'mixed_int8_int32':
        # A,B: int8, C: int32 - addmm does not support this combination, implement manually
        a = torch.randint(-128, 127, (a_rows, a_cols), dtype=torch.int8, device='cuda')
        b = torch.randint(-128, 127, (b_rows, b_cols), dtype=torch.int8, device='cuda')
        c = torch.zeros((M, N), dtype=torch.int32, device='cuda')
        if hasattr(torch, '_int_mm'):
            print(" Using torch._int_mm for int8 matmul")
            def matmul_op(a, b, c):
                a_op = a.t() if transA else a
                b_op = b.t() if transB else b
                result = torch._int_mm(a_op, b_op)
                if alpha != 1.0 or beta != 0.0:
                    result = (alpha * result.float()).to(torch.int32) + beta * c
                c.copy_(result)
                return c
        else:
            print(" Warning: torch._int_mm not available, using fallback")
            def matmul_op(a, b, c):
                a_op = a.t() if transA else a
                b_op = b.t() if transB else b
                result = torch.mm(a_op.float(), b_op.float()).to(torch.int32)
                if alpha != 1.0 or beta != 0.0:
                    result = (alpha * result.float()).to(torch.int32) + beta * c
                c.copy_(result)
                return c
    elif dtype_config == 'w8a8':
        # W8A8: int8 weights, fp16 activations - addmm does not support this, implement manually
        a = torch.randn((a_rows, a_cols), dtype=torch.float16, device='cuda')
        b = torch.randint(-128, 127, (b_rows, b_cols), dtype=torch.int8, device='cuda')
        c = torch.zeros((M, N), dtype=torch.float16, device='cuda')
        def matmul_op(a, b, c):
            a_op = a.t() if transA else a
            b_op = b.t() if transB else b
            b_fp16 = b_op.to(torch.float16)
            result = torch.mm(a_op, b_fp16)
            if alpha != 1.0 or beta != 0.0:
                result = alpha * result + beta * c
            c.copy_(result)
            return c
    elif dtype_config == 'mixed_tf32_fp32':
        # TF32 mode - addmm is supported
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
        a = torch.randn((a_rows, a_cols), dtype=torch.float32, device='cuda')
        b = torch.randn((b_rows, b_cols), dtype=torch.float32, device='cuda')
        c = torch.zeros((M, N), dtype=torch.float32, device='cuda')
        matmul_op = get_blas_op(alpha, beta, transA, transB)
    elif dtype_config == 'tf32':
        # TF32 mode - addmm is supported
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
        a = torch.randn((a_rows, a_cols), dtype=torch.float32, device='cuda')
        b = torch.randn((b_rows, b_cols), dtype=torch.float32, device='cuda')
        c = torch.zeros((M, N), dtype=torch.float32, device='cuda')
        matmul_op = get_blas_op(alpha, beta, transA, transB)
    elif dtype_config == 'int8':
        # Pure int8 mode - addmm does not support int8, implement manually
        a = torch.randint(-128, 127, (a_rows, a_cols), dtype=torch.int8, device='cuda')
        b = torch.randint(-128, 127, (b_rows, b_cols), dtype=torch.int8, device='cuda')
        c = torch.zeros((M, N), dtype=torch.int8, device='cuda')
        def matmul_op(a, b, c):
            a_op = a.t() if transA else a
            b_op = b.t() if transB else b
            result = torch.mm(a_op.float(), b_op.float()).to(torch.int8)
            if alpha != 1.0 or beta != 0.0:
                result = (alpha * result.float()).to(torch.int8) + beta * c
            c.copy_(result)
            return c
    else:
        # Standard precision modes - addmm is supported, use the high-performance path
        dtype_map = {
            'float64': torch.float64,
            'float32': torch.float32,
            'float16': torch.float16,
            'bfloat16': torch.bfloat16,
        }
        dtype = dtype_map.get(dtype_config, torch.float32)
        a = torch.randn((a_rows, a_cols), dtype=dtype, device='cuda')
        b = torch.randn((b_rows, b_cols), dtype=dtype, device='cuda')
        c = torch.zeros((M, N), dtype=dtype, device='cuda')
        matmul_op = get_blas_op(alpha, beta, transA, transB)
    # Warmup
    for _ in range(args.warmup_iterations):
        matmul_op(a, b, c)
    # Synchronize to make sure warmup has finished
    torch.cuda.synchronize()
    # Timed run
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    start_event.record()
    for _ in range(args.bench_iterations):
        matmul_op(a, b, c)
    end_event.record()
    end_event.synchronize()
    latency_ms = start_event.elapsed_time(end_event)
    avg_latency_us = latency_ms * 1e3 / args.bench_iterations
    # FLOPs per GEMM: 2*M*N*K (multiply + add)
    total_flops = 2 * M * N * K
    tflops = total_flops / (avg_latency_us * 1e-6) / 1e12
    return avg_latency_us, tflops, True

def main():
    args = parse_args()
    print(f"\n{'='*80}")
    print(f"GEMM Benchmark")
    print(f"Matrix Size: [{args.M}, {args.K}] x [{args.K}, {args.N}]")
    if args.transA:
        print(f"Transpose A: Yes (actual A shape: [{args.K}, {args.M}])")
    if args.transB:
        print(f"Transpose B: Yes (actual B shape: [{args.N}, {args.K}])")
    print(f"Alpha: {args.alpha}, Beta: {args.beta}")
    print(f"Data Type: {args.dtype}")
    print(f"{'='*80}")
    try:
        avg_latency_us, tflops, success = benchmark_gemm(args, args.dtype)
        if success:
            print(f"\nResults:")
            print(f" Warmup iterations: {args.warmup_iterations}")
            print(f" Benchmark iterations: {args.bench_iterations}")
            print(f" Average latency: {avg_latency_us:.3f} μs")
            print(f" Performance: {tflops:.3f} TFLOPS")
        else:
            print(f"\nBenchmark failed for {args.dtype}")
    except Exception as e:
        print(f"\nError: {str(e)}")
        import traceback
        traceback.print_exc()
        print(f" Benchmark failed for {args.dtype}")


if __name__ == "__main__":
    main()
Git Version: 4bd05bb5-dirty
transA,transB,grouped_gemm,batch_count,m,n,k,alpha,lda,stride_a,beta,ldb,stride_b,ldc,stride_c,ldd,stride_d,a_type,b_type,c_type,d_type,compute_type,scaleA,scaleB,scaleC,scaleD,amaxD,activation_type,bias_vector,bias_type,hipblaslt-Gflops,us,solution_index,gcnArchName,CUs
T,N,0,1,128,128,128,1,128,16384,0,128,16384,128,16384,128,16384,i8_r,i8_r,i32_r,f16_r,i32_r,2,2,0,0,0,,1,f16_r,624.226,6.71921,4452,gfx936:sramecc+:xnack-,80
transA,transB,grouped_gemm,batch_count,m,n,k,alpha,lda,stride_a,beta,ldb,stride_b,ldc,stride_c,ldd,stride_d,a_type,b_type,c_type,d_type,compute_type,scaleA,scaleB,scaleC,scaleD,amaxD,activation_type,bias_vector,bias_type,hipblaslt-Gflops,us,solution_index,gcnArchName,CUs
T,N,0,1,256,256,256,1,256,65536,0,256,65536,256,65536,256,65536,i8_r,i8_r,i32_r,f16_r,i32_r,2,2,0,0,0,,1,f16_r,4403.76,7.6195,4452,gfx936:sramecc+:xnack-,80
transA,transB,grouped_gemm,batch_count,m,n,k,alpha,lda,stride_a,beta,ldb,stride_b,ldc,stride_c,ldd,stride_d,a_type,b_type,c_type,d_type,compute_type,scaleA,scaleB,scaleC,scaleD,amaxD,activation_type,bias_vector,bias_type,hipblaslt-Gflops,us,solution_index,gcnArchName,CUs
T,N,0,1,512,512,512,1,512,262144,0,512,262144,512,262144,512,262144,i8_r,i8_r,i32_r,f16_r,i32_r,2,2,0,0,0,,1,f16_r,26791.3,10.0195,4500,gfx936:sramecc+:xnack-,80
transA,transB,grouped_gemm,batch_count,m,n,k,alpha,lda,stride_a,beta,ldb,stride_b,ldc,stride_c,ldd,stride_d,a_type,b_type,c_type,d_type,compute_type,scaleA,scaleB,scaleC,scaleD,amaxD,activation_type,bias_vector,bias_type,hipblaslt-Gflops,us,solution_index,gcnArchName,CUs
T,N,0,1,1024,1024,1024,1,1024,1048576,0,1024,1048576,1024,1048576,1024,1048576,i8_r,i8_r,i32_r,f16_r,i32_r,2,2,0,0,0,,1,f16_r,108893,19.721,4456,gfx936:sramecc+:xnack-,80
transA,transB,grouped_gemm,batch_count,m,n,k,alpha,lda,stride_a,beta,ldb,stride_b,ldc,stride_c,ldd,stride_d,a_type,b_type,c_type,d_type,compute_type,scaleA,scaleB,scaleC,scaleD,amaxD,activation_type,bias_vector,bias_type,hipblaslt-Gflops,us,solution_index,gcnArchName,CUs
T,N,0,1,2048,2048,2048,1,2048,4194304,0,2048,4194304,2048,4194304,2048,4194304,i8_r,i8_r,i32_r,f16_r,i32_r,2,2,0,0,0,,1,f16_r,315119,54.5186,4477,gfx936:sramecc+:xnack-,80
transA,transB,grouped_gemm,batch_count,m,n,k,alpha,lda,stride_a,beta,ldb,stride_b,ldc,stride_c,ldd,stride_d,a_type,b_type,c_type,d_type,compute_type,scaleA,scaleB,scaleC,scaleD,amaxD,activation_type,bias_vector,bias_type,hipblaslt-Gflops,us,solution_index,gcnArchName,CUs
T,N,0,1,4096,4096,4096,1,4096,16777216,0,4096,16777216,4096,16777216,4096,16777216,i8_r,i8_r,i32_r,f16_r,i32_r,2,2,0,0,0,,1,f16_r,475043,289.319,4478,gfx936:sramecc+:xnack-,80
transA,transB,grouped_gemm,batch_count,m,n,k,alpha,lda,stride_a,beta,ldb,stride_b,ldc,stride_c,ldd,stride_d,a_type,b_type,c_type,d_type,compute_type,scaleA,scaleB,scaleC,scaleD,amaxD,activation_type,bias_vector,bias_type,hipblaslt-Gflops,us,solution_index,gcnArchName,CUs
T,N,0,1,8192,8192,8192,1,8192,67108864,0,8192,67108864,8192,67108864,8192,67108864,i8_r,i8_r,i32_r,f16_r,i32_r,2,2,0,0,0,,1,f16_r,674829,1629.32,4479,gfx936:sramecc+:xnack-,80
transA,transB,grouped_gemm,batch_count,m,n,k,alpha,lda,stride_a,beta,ldb,stride_b,ldc,stride_c,ldd,stride_d,a_type,b_type,c_type,d_type,compute_type,scaleA,scaleB,scaleC,scaleD,amaxD,activation_type,bias_vector,bias_type,hipblaslt-Gflops,us,solution_index,gcnArchName,CUs
T,N,0,1,4098,4098,4098,1,4098,16793604,0,4098,16793604,4098,16793604,4098,16793604,i8_r,i8_r,i32_r,f16_r,i32_r,2,2,0,0,0,,1,f16_r,209556,656.819,4478,gfx936:sramecc+:xnack-,80
transA,transB,grouped_gemm,batch_count,m,n,k,alpha,lda,stride_a,beta,ldb,stride_b,ldc,stride_c,ldd,stride_d,a_type,b_type,c_type,d_type,compute_type,scaleA,scaleB,scaleC,scaleD,amaxD,activation_type,bias_vector,bias_type,hipblaslt-Gflops,us,solution_index,gcnArchName,CUs
T,N,0,1,8190,8190,8190,1,8190,67076100,0,8190,67076100,8190,67076100,8190,67076100,i8_r,i8_r,i32_r,f16_r,i32_r,2,2,0,0,0,,1,f16_r,266242,4126.72,4479,gfx936:sramecc+:xnack-,80
transA,transB,grouped_gemm,batch_count,m,n,k,alpha,lda,stride_a,beta,ldb,stride_b,ldc,stride_c,ldd,stride_d,a_type,b_type,c_type,d_type,compute_type,scaleA,scaleB,scaleC,scaleD,amaxD,activation_type,bias_vector,bias_type,hipblaslt-Gflops,us,solution_index,gcnArchName,CUs
T,N,0,1,8192,8192,768,1,8192,67108864,0,768,6291456,8192,67108864,8192,67108864,i8_r,i8_r,i32_r,f16_r,i32_r,2,2,0,0,0,,1,f16_r,277453,371.519,4484,gfx936:sramecc+:xnack-,80
#!/bin/bash
# GEMM Benchmark Shell Script
# Tests GEMM performance across various matrix shapes and data types
export ROCBLAS_TENSILE_LIBPATH=/opt/dtk-26.04/lib/rocblas/auto_select_test/auto_select_tools/optimization_configs/new/config/library_gpu5/
export HIP_VISIBLE_DEVICES=3
set -e
# Colored output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Log and results files
LOG_FILE="gemm_benchmark_$(date +%Y%m%d_%H%M%S).log"
CSV_FILE="gemm_benchmark_results_$(date +%Y%m%d_%H%M%S).csv"
# Path to the Python benchmark script
PYTHON_SCRIPT="gemm_benchmark.py"
# Default parameters
WARMUP_ITER=50
BENCH_ITER=1000
# Colored logging helpers
print_info() {
echo -e "${BLUE}[INFO]${NC} $1" | tee -a "$LOG_FILE"
}
print_success() {
echo -e "${GREEN}[SUCCESS]${NC} $1" | tee -a "$LOG_FILE"
}
print_error() {
echo -e "${RED}[ERROR]${NC} $1" | tee -a "$LOG_FILE"
}
print_section() {
echo -e "\n${YELLOW}========================================${NC}" | tee -a "$LOG_FILE"
echo -e "${YELLOW}$1${NC}" | tee -a "$LOG_FILE"
echo -e "${YELLOW}========================================${NC}" | tee -a "$LOG_FILE"
}
# Initialize the CSV file
init_csv() {
echo "Shape_M,Shape_K,Shape_N,DataType,Latency_us,TFLOPS,Status" > "$CSV_FILE"
}
# Run a single test case
run_test() {
local M=$1
local K=$2
local N=$3
local dtype=$4
print_info "Testing: M=$M, K=$K, N=$N, dtype=$dtype"
# Run the Python script and capture its output (|| true keeps set -e from aborting the suite on a failed case)
output=$(python3 "$PYTHON_SCRIPT" \
--M "$M" \
--K "$K" \
--N "$N" \
--dtype "$dtype" \
--warmup_iterations "$WARMUP_ITER" \
--bench_iterations "$BENCH_ITER" --transA 2>&1) || true
# Extract latency and TFLOPS from the report
latency=$(echo "$output" | grep "Average latency:" | awk '{print $3}')
tflops=$(echo "$output" | grep "Performance:" | awk '{print $2}')
if [ -n "$latency" ] && [ -n "$tflops" ]; then
print_success " Latency: ${latency} μs, TFLOPS: ${tflops}"
echo "$M,$K,$N,$dtype,$latency,$tflops,SUCCESS" >> "$CSV_FILE"
else
print_error " Test failed for $dtype"
echo "$M,$K,$N,$dtype,0,0,FAILED" >> "$CSV_FILE"
fi
echo "" >> "$LOG_FILE"
}
# Test case 1: M=K=N (powers of two)
test_power_of_two() {
print_section "Test Case 1: Square matrices (power of 2)"
local sizes=(128 256 512 1024 2048 4096 8192)
for size in "${sizes[@]}"; do
print_info "Testing square matrix: $size x $size"
for dtype in "${dtypes[@]}"; do
run_test "$size" "$size" "$size" "$dtype"
done
done
}
# Test case 2: M=K=N (non-aligned sizes)
test_non_aligned() {
print_section "Test Case 2: Square matrices (non-aligned)"
local sizes=(4098 8190)
for size in "${sizes[@]}"; do
print_info "Testing square matrix: $size x $size"
for dtype in "${dtypes[@]}"; do
run_test "$size" "$size" "$size" "$dtype"
done
done
}
# Test case 3: specific shape
test_specific_shape() {
print_section "Test Case 3: Specific shape (M=8192, K=768, N=8192)"
run_test 8192 768 8192 "float64"
run_test 8192 768 8192 "float32"
run_test 8192 768 8192 "float16"
run_test 8192 768 8192 "bfloat16"
run_test 8192 768 8192 "tf32"
run_test 8192 768 8192 "mixed_fp16_fp32"
run_test 8192 768 8192 "mixed_bf16_fp32"
run_test 8192 768 8192 "mixed_int8_int32"
run_test 8192 768 8192 "mixed_tf32_fp32"
run_test 8192 768 8192 "w8a8"
}
# Check whether CUDA is available
check_cuda() {
print_info "Checking CUDA availability..."
if python3 -c "import torch; assert torch.cuda.is_available()" 2>/dev/null; then
cuda_version=$(python3 -c "import torch; print(torch.version.cuda)")
gpu_name=$(python3 -c "import torch; print(torch.cuda.get_device_name(0))")
print_success "CUDA available: $cuda_version"
print_success "GPU: $gpu_name"
echo "CUDA Version: $cuda_version" >> "$LOG_FILE"
echo "GPU: $gpu_name" >> "$LOG_FILE"
else
print_error "CUDA not available. Exiting."
exit 1
fi
}
# Main entry point
main() {
print_info "Starting GEMM Benchmark Suite"
print_info "Log file: $LOG_FILE"
print_info "Results CSV: $CSV_FILE"
# Check CUDA
check_cuda
# Initialize the CSV file
init_csv
# Record system information
echo "System Information:" >> "$LOG_FILE"
echo "Date: $(date)" >> "$LOG_FILE"
echo "Hostname: $(hostname)" >> "$LOG_FILE"
echo "Python: $(which python3)" >> "$LOG_FILE"
echo "PyTorch: $(python3 -c 'import torch; print(torch.__version__)')" >> "$LOG_FILE"
echo "" >> "$LOG_FILE"
# Data types to test
dtypes=(
"float64"
"float32"
"float16"
"bfloat16"
"tf32"
"mixed_fp16_fp32"
"mixed_bf16_fp32"
"mixed_int8_int32"
"mixed_tf32_fp32"
"w8a8"
)
# Run the test suites
test_power_of_two
test_non_aligned
test_specific_shape
print_section "Benchmark Complete"
print_success "All tests finished. Results saved to $CSV_FILE"
print_info "Log saved to $LOG_FILE"
# Print a results summary
echo -e "\n${GREEN}Results Summary:${NC}"
echo "========================================="
echo "CSV file: $CSV_FILE"
echo "Log file: $LOG_FILE"
echo ""
echo "To view results:"
echo " cat $CSV_FILE"
echo " or use a spreadsheet application"
}
# Run the main function
main
#!/bin/bash
# hipBLASLt W8A8 GEMM performance test script
# Tests GEMM performance for multiple matrix shapes and data types
# Select the GPU device to use
export HIP_VISIBLE_DEVICES=7
chmod +x ${ROCM_PATH}/bin/hipblaslt-bench
export HIPBLASLT_TUNING_OVERRIDE_FILE=hipblaslt.config
# Output file
OUTPUT_CSV="hipblaslt_bench_results.csv"
# Write the CSV header (all fields)
echo "shape_type,data_type,M,N,K,alpha,beta,transA,transB,grouped_gemm,batch_count,lda,stride_a,ldb,stride_b,ldc,stride_c,ldd,stride_d,a_type,b_type,c_type,d_type,compute_type,scaleA,scaleB,scaleC,scaleD,amaxD,activation_type,bias_vector,bias_type,hipblaslt-Gflops,us,status" > ${OUTPUT_CSV}
# Define the test function
run_test() {
local shape_type=$1
local data_type=$2
local M=$3
local N=$4
local K=$5
local alpha=${6}
local beta=${7}
echo "=========================================="
echo "Running test: ${shape_type} - ${data_type} - M=${M} N=${N} K=${K}"
# Build the hipblaslt-bench command
local cmd="hipblaslt-bench --api_method c \
-m ${M} -n ${N} -k ${K} \
--alpha ${alpha} --beta ${beta} \
--transA T --transB N \
--batch_count 1 \
--scaleA 2 --scaleB 2 \
--bias_vector \
--bias_source d \
--a_type i8_r --lda ${M} \
--b_type i8_r --ldb ${K} \
--c_type i32_r --ldc ${M} \
--d_type f16_r --ldd ${M} \
--scale_type f32_r \
--bias_type f16_r \
--compute_type i32_r \
--print_kernel_info \
--cold_iters 50 --iters 1000"
# Run hipblaslt-bench and capture its output
local output=$(eval ${cmd} 2>&1)
# Check for errors
if echo "${output}" | grep -q "error:"; then
echo "Test failed: ${shape_type} - ${data_type}"
echo "${shape_type},${data_type},${M},${N},${K},${alpha},${beta},T,N,0,1,0,0,0,0,0,0,0,0,i8_r,i8_r,i32_r,f16_r,i32_r,2,2,0,0,0,none,1,f16_r,FAILED,FAILED,ERROR" >> ${OUTPUT_CSV}
return
fi
# Extract the actual data row
# Method 1: look for a line that starts with the transA value (T or N) and has enough comma-separated fields
# Data row format: T,N,0,1,1024,1024,1024,1,1024,1048576,0,1024,1048576,1024,1048576,1024,1048576,i8_r,i8_r,i32_r,f16_r,i32_r,2,2,0,0,0,none,1,f16_r,138933,15.457
local data_line=""
# First try lines starting with "T," or "N," (the signature of a data row)
data_line=$(echo "${output}" | grep -E "^(T|N),.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,[0-9]+\.[0-9]+,[0-9]+\.[0-9]+$" | tail -1)
# If nothing was found, look for the header line containing hipblaslt-Gflops
if [ -z "${data_line}" ]; then
data_line=$(echo "${output}" | grep "hipblaslt-Gflops" | tail -1)
# If the header was found, take the following line (the actual data)
if [ -n "${data_line}" ]; then
data_line=$(echo "${output}" | grep -A1 "hipblaslt-Gflops" | tail -1)
fi
fi
# If still nothing, look for a line that starts with T or N and ends in two numbers
if [ -z "${data_line}" ]; then
data_line=$(echo "${output}" | grep -E "^(T|N)," | grep -E "[0-9]+\.[0-9]+,[0-9]+\.[0-9]+$" | tail -1)
fi
# Last resort: a comma-containing line that does not start with [, -, or whitespace
if [ -z "${data_line}" ]; then
data_line=$(echo "${output}" | grep -v "^\[" | grep -v "^-" | grep -v "^[[:space:]]*$" | grep "," | grep -E "[0-9]+\.[0-9]+,[0-9]+\.[0-9]+$" | tail -1)
fi
# Strip leading/trailing whitespace from the data line
data_line=$(echo "${data_line}" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
if [ -n "${data_line}" ] && echo "${data_line}" | grep -q ","; then
# Validate the data row: it should start with T, or N, and contain at least 30 commas
local comma_count=$(echo "${data_line}" | tr -cd ',' | wc -c)
if [ ${comma_count} -lt 30 ]; then
echo "Warning: Data line has only ${comma_count} commas, might be incomplete"
echo "Data line: ${data_line}"
fi
# Parse all fields of the data row
# Field indices (1-based):
# 1:transA, 2:transB, 3:grouped_gemm, 4:batch_count, 5:m, 6:n, 7:k,
# 8:alpha, 9:lda, 10:stride_a, 11:beta, 12:ldb, 13:stride_b,
# 14:ldc, 15:stride_c, 16:ldd, 17:stride_d,
# 18:a_type, 19:b_type, 20:c_type, 21:d_type, 22:compute_type,
# 23:scaleA, 24:scaleB, 25:scaleC, 26:scaleD, 27:amaxD,
# 28:activation_type, 29:bias_vector, 30:bias_type,
# 31:hipblaslt-Gflops, 32:us
local transA=$(echo "${data_line}" | awk -F',' '{print $1}')
local transB=$(echo "${data_line}" | awk -F',' '{print $2}')
local grouped_gemm=$(echo "${data_line}" | awk -F',' '{print $3}')
local batch_count=$(echo "${data_line}" | awk -F',' '{print $4}')
# Extract fields based on the actual field count
local field_count=$(echo "${data_line}" | awk -F',' '{print NF}')
if [ ${field_count} -ge 32 ]; then
local lda=$(echo "${data_line}" | awk -F',' '{print $9}')
local stride_a=$(echo "${data_line}" | awk -F',' '{print $10}')
local ldb=$(echo "${data_line}" | awk -F',' '{print $12}')
local stride_b=$(echo "${data_line}" | awk -F',' '{print $13}')
local ldc=$(echo "${data_line}" | awk -F',' '{print $14}')
local stride_c=$(echo "${data_line}" | awk -F',' '{print $15}')
local ldd=$(echo "${data_line}" | awk -F',' '{print $16}')
local stride_d=$(echo "${data_line}" | awk -F',' '{print $17}')
local a_type=$(echo "${data_line}" | awk -F',' '{print $18}')
local b_type=$(echo "${data_line}" | awk -F',' '{print $19}')
local c_type=$(echo "${data_line}" | awk -F',' '{print $20}')
local d_type=$(echo "${data_line}" | awk -F',' '{print $21}')
local compute_type=$(echo "${data_line}" | awk -F',' '{print $22}')
local scaleA=$(echo "${data_line}" | awk -F',' '{print $23}')
local scaleB=$(echo "${data_line}" | awk -F',' '{print $24}')
local scaleC=$(echo "${data_line}" | awk -F',' '{print $25}')
local scaleD=$(echo "${data_line}" | awk -F',' '{print $26}')
local amaxD=$(echo "${data_line}" | awk -F',' '{print $27}')
local activation_type=$(echo "${data_line}" | awk -F',' '{print $28}')
local bias_vector=$(echo "${data_line}" | awk -F',' '{print $29}')
local bias_type=$(echo "${data_line}" | awk -F',' '{print $30}')
local gflops=$(echo "${data_line}" | awk -F',' '{print $31}')
local us=$(echo "${data_line}" | awk -F',' '{print $32}')
else
# If there are not enough fields, take Gflops and us from the end of the line
local gflops=$(echo "${data_line}" | awk -F',' '{print $(NF-1)}')
local us=$(echo "${data_line}" | awk -F',' '{print $NF}')
# Fall back to default values for the remaining fields
lda=0; stride_a=0; ldb=0; stride_b=0; ldc=0; stride_c=0; ldd=0; stride_d=0
a_type="i8_r"; b_type="i8_r"; c_type="i32_r"; d_type="f16_r"
compute_type="i32_r"; scaleA=2; scaleB=2; scaleC=0; scaleD=0
amaxD=0; activation_type="none"; bias_vector=1; bias_type="f16_r"
fi
# Strip any leading/trailing whitespace
gflops=$(echo "${gflops}" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
us=$(echo "${us}" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
echo "Result: ${gflops} GFLOPS, ${us} us"
# Append the parsed row to the CSV
echo "${shape_type},${data_type},${M},${N},${K},${alpha},${beta},${transA},${transB},${grouped_gemm},${batch_count},${lda},${stride_a},${ldb},${stride_b},${ldc},${stride_c},${ldd},${stride_d},${a_type},${b_type},${c_type},${d_type},${compute_type},${scaleA},${scaleB},${scaleC},${scaleD},${amaxD},${activation_type},${bias_vector},${bias_type},${gflops},${us},SUCCESS" >> ${OUTPUT_CSV}
else
echo "Test failed to produce valid output: ${shape_type} - ${data_type}"
echo "Raw output (last 20 lines):"
echo "${output}" | tail -20
echo "${shape_type},${data_type},${M},${N},${K},${alpha},${beta},T,N,0,1,0,0,0,0,0,0,0,0,i8_r,i8_r,i32_r,f16_r,i32_r,2,2,0,0,0,none,1,f16_r,FAILED,FAILED,NO_OUTPUT" >> ${OUTPUT_CSV}
fi
}
# Test case 3a: M, K, and N all equal
for size in 128 256 512 1024 2048 4096 8192; do
run_test "square" "w8a8" ${size} ${size} ${size} 1.0 0.0
done
# Test case 3b: sizes not aligned to 4
for size in 4098 8190; do
run_test "square" "w8a8" ${size} ${size} ${size} 1.0 0.0
done
# Test case 3c: M=8192, K=768, N=8192
run_test "square" "w8a8" 8192 8192 768 1.0 0.0
echo "=========================================="
echo "All tests completed. Results saved to ${OUTPUT_CSV}"
#!/bin/bash
# ROCm rocBLAS performance test script
# Tests GEMM performance for multiple matrix shapes and data types
# Output file
export ROCBLAS_TENSILE_LIBPATH=./library_gpu5
export HIP_VISIBLE_DEVICES=3
chmod +x ${ROCM_PATH}/bin/rocblas-bench
OUTPUT_CSV="rocblas_bench_results_transB.csv"
# Write the CSV header
echo "shape_type,data_type,M,N,K,alpha,beta,transA,transB,a_type,b_type,c_type,d_type,compute_type,rocblas-Gflops,us" > ${OUTPUT_CSV}
# Define the test function
run_test() {
local shape_type=$1
local data_type=$2
local M=$3
local N=$4
local K=$5
local a_type=$6
local b_type=$7
local c_type=$8
local d_type=$9
local compute_type=${10}
local alpha=${11}
local beta=${12}
local math_mode=${13}
echo "Running test: ${shape_type} - ${data_type} - M=${M} N=${N} K=${K}"
# Build the rocblas-bench command (with --transposeB T, B is stored N x K, so ldb uses N)
local cmd="rocblas-bench -f gemm_ex \
-m ${M} -n ${N} -k ${K} \
--alpha ${alpha} \
--transposeA N --transposeB T \
--a_type ${a_type} --lda ${M} \
--b_type ${b_type} --ldb ${N} \
--beta ${beta} \
--c_type ${c_type} --ldc ${M} \
--d_type ${d_type} --ldd ${M} \
--compute_type ${compute_type} \
--cold_iters 50 --iters 1000"
# Append math_mode if it was specified
if [ -n "${math_mode}" ]; then
cmd="${cmd} --math_mode ${math_mode}"
fi
# Run rocblas-bench and keep only the last line of output
local output=$(eval ${cmd} 2>/dev/null | tail -1)
# Parse the output
if [ -n "${output}" ]; then
# Extract the rocblas-Gflops and us columns
local gflops=$(echo ${output} | awk '{print $(NF-1)}')
local us=$(echo ${output} | awk '{print $NF}')
echo "${shape_type},${data_type},${M},${N},${K},${alpha},${beta},N,T,${a_type},${b_type},${c_type},${d_type},${compute_type},${gflops},${us}" >> ${OUTPUT_CSV}
else
echo "Test failed or not supported: ${shape_type} - ${data_type}"
echo "${shape_type},${data_type},${M},${N},${K},${alpha},${beta},N,T,${a_type},${b_type},${c_type},${d_type},${compute_type},FAILED,FAILED" >> ${OUTPUT_CSV}
fi
}
# Test case 3a: M, K, and N all equal
for size in 128 256 512 1024 2048 4096 8192; do
# fp64
run_test "square" "fp64" ${size} ${size} ${size} "f64_r" "f64_r" "f64_r" "f64_r" "f64_r" 1.0 0.0 ""
# fp32
run_test "square" "fp32" ${size} ${size} ${size} "f32_r" "f32_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 ""
# fp16
run_test "square" "fp16" ${size} ${size} ${size} "f16_r" "f16_r" "f16_r" "f16_r" "f32_r" 1.0 0.0 ""
# bf16
run_test "square" "bf16" ${size} ${size} ${size} "bf16_r" "bf16_r" "bf16_r" "bf16_r" "f32_r" 1.0 0.0 ""
# tf32 - use f32_r types with math_mode=1
run_test "square" "tf32" ${size} ${size} ${size} "f32_r" "f32_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 "1"
done
# Test case 3b: sizes not aligned to 4
for size in 4098 8190; do
# fp64
run_test "non_aligned" "fp64" ${size} ${size} ${size} "f64_r" "f64_r" "f64_r" "f64_r" "f64_r" 1.0 0.0 ""
# fp32
run_test "non_aligned" "fp32" ${size} ${size} ${size} "f32_r" "f32_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 ""
# fp16
run_test "non_aligned" "fp16" ${size} ${size} ${size} "f16_r" "f16_r" "f16_r" "f16_r" "f32_r" 1.0 0.0 ""
# bf16
run_test "non_aligned" "bf16" ${size} ${size} ${size} "bf16_r" "bf16_r" "bf16_r" "bf16_r" "f32_r" 1.0 0.0 ""
# tf32 - use f32_r types with math_mode=1
run_test "non_aligned" "tf32" ${size} ${size} ${size} "f32_r" "f32_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 "1"
done
# Test case 3c: M=8192, K=768, N=8192
run_test "special" "fp64" 8192 8192 768 "f64_r" "f64_r" "f64_r" "f64_r" "f64_r" 1.0 0.0 ""
run_test "special" "fp32" 8192 8192 768 "f32_r" "f32_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 ""
run_test "special" "fp16" 8192 8192 768 "f16_r" "f16_r" "f16_r" "f16_r" "f32_r" 1.0 0.0 ""
run_test "special" "bf16" 8192 8192 768 "bf16_r" "bf16_r" "bf16_r" "bf16_r" "f32_r" 1.0 0.0 ""
run_test "special" "tf32" 8192 8192 768 "f32_r" "f32_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 "1"
# Mixed-precision tests
for size in 128 256 512 1024 2048 4096 8192; do
# fp16_fp32
run_test "mixed" "fp16_fp32" ${size} ${size} ${size} "f16_r" "f16_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 ""
# bf16_fp32
run_test "mixed" "bf16_fp32" ${size} ${size} ${size} "bf16_r" "bf16_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 ""
# int8_int32
run_test "mixed" "int8_int32" ${size} ${size} ${size} "i8_r" "i8_r" "i32_r" "i32_r" "i32_r" 1.0 0.0 ""
done
for size in 4098 8190; do
# fp16_fp32
run_test "mixed" "fp16_fp32" ${size} ${size} ${size} "f16_r" "f16_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 ""
# bf16_fp32
run_test "mixed" "bf16_fp32" ${size} ${size} ${size} "bf16_r" "bf16_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 ""
# int8_int32
run_test "mixed" "int8_int32" ${size} ${size} ${size} "i8_r" "i8_r" "i32_r" "i32_r" "i32_r" 1.0 0.0 ""
done
# Case 5b: A,B: fp16, C,D: fp32
run_test "mixed" "fp16_fp32" 4096 4096 4096 "f16_r" "f16_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 ""
run_test "mixed" "fp16_fp32" 8192 8192 8192 "f16_r" "f16_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 ""
run_test "mixed" "fp16_fp32" 8192 8192 768 "f16_r" "f16_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 ""
# Case 5c: A,B: bf16, C,D: fp32
run_test "mixed" "bf16_fp32" 4096 4096 4096 "bf16_r" "bf16_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 ""
run_test "mixed" "bf16_fp32" 8192 8192 8192 "bf16_r" "bf16_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 ""
run_test "mixed" "bf16_fp32" 8192 8192 768 "bf16_r" "bf16_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 ""
# Case 5e: A,B: int8, C,D: int32
run_test "mixed" "int8_int32" 4096 4096 4096 "i8_r" "i8_r" "i32_r" "i32_r" "i32_r" 1.0 0.0 ""
run_test "mixed" "int8_int32" 8192 8192 8192 "i8_r" "i8_r" "i32_r" "i32_r" "i32_r" 1.0 0.0 ""
run_test "mixed" "int8_int32" 8192 8192 768 "i8_r" "i8_r" "i32_r" "i32_r" "i32_r" 1.0 0.0 ""