Commit 4a4927e5 authored by sunzhq2

init gemm-test

parent d51e625f
# GEMM-test
# Image usage
```
harbor.sourcefind.cn:5443/dcu/admin/base/vllm:0.11.0-ubuntu22.04-dtk26.04-0130-py3.10-20260204
```
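The container can be started in the usual way for DCU/ROCm images; the command below is only a sketch and depends on the host setup (it assumes the DCU devices are exposed through /dev/kfd and /dev/dri).
```
docker run -it --rm \
  --device=/dev/kfd --device=/dev/dri \
  --security-opt seccomp=unconfined \
  --shm-size=16G \
  harbor.sourcefind.cn:5443/dcu/admin/base/vllm:0.11.0-ubuntu22.04-dtk26.04-0130-py3.10-20260204 \
  /bin/bash
```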
# rocblas-bench
rocblas-bench is used to benchmark matrix multiplication. Before use, set up the executable with the following commands:
```
ln -s ${ROCM_PATH}/lib/rocblas/benchmark_tool/rocblas-bench ${ROCM_PATH}/bin/rocblas-bench
chmod +x ${ROCM_PATH}/bin/rocblas-bench
```
```
tar zxvf library_gpu5.tar.gz -C ./
bash run_rocblas_bench.sh
```
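A single GEMM case can also be run by hand. The sketch below only reuses flags that the bundled rocblas-bench script already passes (fp16 inputs with fp32 compute; the 4096x4096x4096 shape is just an example):
```
rocblas-bench -f gemm_ex \
    -m 4096 -n 4096 -k 4096 \
    --alpha 1.0 --beta 0.0 \
    --transposeA N --transposeB T \
    --a_type f16_r --lda 4096 \
    --b_type f16_r --ldb 4096 \
    --c_type f16_r --ldc 4096 \
    --d_type f16_r --ldd 4096 \
    --compute_type f32_r \
    --cold_iters 50 --iters 1000
```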
# hipblaslt
hipblaslt-bench is used to benchmark W8A8 GEMM. Before use, set up the executable with the following commands:
```
ln -s ${ROCM_PATH}/lib/hipblaslt/benchmark_tool/hipblaslt-bench ${ROCM_PATH}/bin/hipblaslt-bench
chmod +x ${ROCM_PATH}/bin/hipblaslt-bench
```
```
bash run_rocblas_bench.sh
```
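Similarly, a single W8A8 case can be launched directly; the sketch below mirrors the flags used by the bundled hipblaslt-bench script (int8 A/B, int32 compute, fp16 output; 4096x4096x4096 is just an example shape):
```
hipblaslt-bench --api_method c \
    -m 4096 -n 4096 -k 4096 \
    --alpha 1.0 --beta 0.0 \
    --transA T --transB N \
    --a_type i8_r --b_type i8_r \
    --c_type i32_r --d_type f16_r \
    --compute_type i32_r \
    --scale_type f32_r \
    --scaleA 2 --scaleB 2 \
    --cold_iters 50 --iters 1000
```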
# torch_gemm test
- run_gemm_benchmarks_torch.sh and gemm_benchmark.py implement the GEMM benchmark with torch (see the example invocation below)
- bash run_gemm_benchmarks_torch.sh
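
gemm_benchmark.py can also be invoked standalone for a single shape; every flag below is defined in its argument parser, and the values are only an example:
```
python3 gemm_benchmark.py --M 8192 --K 768 --N 8192 \
    --dtype float16 --warmup_iterations 50 --bench_iterations 1000 --transB
```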
import torch
import argparse
import time
import numpy as np
def parse_args():
    parser = argparse.ArgumentParser(description='gemm benchmark')
    parser.add_argument('--M', type=int, default=4096, help='M')
    parser.add_argument('--K', type=int, default=4096, help='K')
    parser.add_argument('--N', type=int, default=4096, help='N')
    parser.add_argument('--dtype', type=str, default='bfloat16',
                        choices=['float64', 'float32', 'float16', 'bfloat16', 'tf32',
                                 'int8', 'mixed_fp16_fp32', 'mixed_bf16_fp32',
                                 'mixed_int8_int32', 'mixed_tf32_fp32', 'w8a8'],
                        help='data type to benchmark')
    parser.add_argument('--alpha', type=float, default=1.0, help='alpha')
    parser.add_argument('--beta', type=float, default=0.0, help='beta')
    parser.add_argument('--warmup_iterations', type=int, default=50, help='number of warmup iterations')
    parser.add_argument('--bench_iterations', type=int, default=1000, help='number of benchmark iterations')
    parser.add_argument('--transA', action='store_true', default=False,
                        help='transpose matrix A')
    parser.add_argument('--transB', action='store_true', default=False,
                        help='transpose matrix B')
    return parser.parse_args()

def get_matrix(dims, dtype, device='cuda'):
    """Create a matrix of the given dtype."""
    if dtype in [torch.float64, torch.float32, torch.float16, torch.bfloat16]:
        return torch.randn(dims, dtype=dtype, device=device)
    elif dtype == torch.int8:
        return torch.randint(-128, 127, dims, dtype=torch.int8, device=device)
    else:
        return torch.randn(dims, dtype=torch.float32, device=device)


def get_blas_op(alpha, beta, transA=False, transB=False):
    """Return a BLAS-style GEMM operator."""
    def blas_op(a, b, c):
        # torch.addmm has no transpose flags, so transpose the inputs by hand
        a_op = a.t() if transA else a
        b_op = b.t() if transB else b
        return torch.addmm(c, a_op, b_op, beta=beta, alpha=alpha)
    return blas_op

def benchmark_gemm(args, dtype_config):
    """Run the GEMM benchmark."""
    M, K, N = args.M, args.K, args.N
    alpha, beta = args.alpha, args.beta
    transA, transB = args.transA, args.transB
    # Determine actual storage shapes from the transpose flags
    a_rows, a_cols = (K, M) if transA else (M, K)
    b_rows, b_cols = (N, K) if transB else (K, N)
    # Set up inputs and the matmul operator for the requested dtype configuration
    if dtype_config == 'mixed_fp16_fp32':
        # A,B: fp16, C: fp32 - addmm does not support this combination, implement manually
        a = torch.randn((a_rows, a_cols), dtype=torch.float16, device='cuda')
        b = torch.randn((b_rows, b_cols), dtype=torch.float16, device='cuda')
        c = torch.zeros((M, N), dtype=torch.float32, device='cuda')
        def matmul_op(a, b, c):
            a_op = a.t() if transA else a
            b_op = b.t() if transB else b
            result = torch.mm(a_op, b_op)  # fp16 result; copy_ below casts it into the fp32 C
            if alpha != 1.0 or beta != 0.0:
                result = alpha * result + beta * c
            c.copy_(result)
            return c
    elif dtype_config == 'mixed_bf16_fp32':
        # A,B: bf16, C: fp32 - addmm does not support this combination, implement manually
        a = torch.randn((a_rows, a_cols), dtype=torch.bfloat16, device='cuda')
        b = torch.randn((b_rows, b_cols), dtype=torch.bfloat16, device='cuda')
        c = torch.zeros((M, N), dtype=torch.float32, device='cuda')
        def matmul_op(a, b, c):
            a_op = a.t() if transA else a
            b_op = b.t() if transB else b
            result = torch.mm(a_op, b_op)  # bf16 result; copy_ below casts it into the fp32 C
            if alpha != 1.0 or beta != 0.0:
                result = alpha * result + beta * c
            c.copy_(result)
            return c
    elif dtype_config == 'mixed_int8_int32':
        # A,B: int8, C: int32 - addmm does not support this combination, implement manually
        a = torch.randint(-128, 127, (a_rows, a_cols), dtype=torch.int8, device='cuda')
        b = torch.randint(-128, 127, (b_rows, b_cols), dtype=torch.int8, device='cuda')
        c = torch.zeros((M, N), dtype=torch.int32, device='cuda')
        if hasattr(torch, '_int_mm'):
            print(" Using torch._int_mm for int8 matmul")
            def matmul_op(a, b, c):
                a_op = a.t() if transA else a
                b_op = b.t() if transB else b
                result = torch._int_mm(a_op, b_op)
                if alpha != 1.0 or beta != 0.0:
                    result = (alpha * result.float()).to(torch.int32) + beta * c
                c.copy_(result)
                return c
        else:
            print(" Warning: torch._int_mm not available, using fallback")
            def matmul_op(a, b, c):
                a_op = a.t() if transA else a
                b_op = b.t() if transB else b
                result = torch.mm(a_op.float(), b_op.float()).to(torch.int32)
                if alpha != 1.0 or beta != 0.0:
                    result = (alpha * result.float()).to(torch.int32) + beta * c
                c.copy_(result)
                return c
    elif dtype_config == 'w8a8':
        # W8A8: int8 weights, fp16 activations - addmm does not support this, implement manually
        a = torch.randn((a_rows, a_cols), dtype=torch.float16, device='cuda')
        b = torch.randint(-128, 127, (b_rows, b_cols), dtype=torch.int8, device='cuda')
        c = torch.zeros((M, N), dtype=torch.float16, device='cuda')
        def matmul_op(a, b, c):
            a_op = a.t() if transA else a
            b_op = b.t() if transB else b
            b_fp16 = b_op.to(torch.float16)
            result = torch.mm(a_op, b_fp16)
            if alpha != 1.0 or beta != 0.0:
                result = alpha * result + beta * c
            c.copy_(result)
            return c
    elif dtype_config == 'mixed_tf32_fp32':
        # TF32 mode - addmm is supported
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
        a = torch.randn((a_rows, a_cols), dtype=torch.float32, device='cuda')
        b = torch.randn((b_rows, b_cols), dtype=torch.float32, device='cuda')
        c = torch.zeros((M, N), dtype=torch.float32, device='cuda')
        matmul_op = get_blas_op(alpha, beta, transA, transB)
    elif dtype_config == 'tf32':
        # TF32 mode - addmm is supported
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
        a = torch.randn((a_rows, a_cols), dtype=torch.float32, device='cuda')
        b = torch.randn((b_rows, b_cols), dtype=torch.float32, device='cuda')
        c = torch.zeros((M, N), dtype=torch.float32, device='cuda')
        matmul_op = get_blas_op(alpha, beta, transA, transB)
    elif dtype_config == 'int8':
        # Pure int8 mode - addmm does not support int8, implement manually
        a = torch.randint(-128, 127, (a_rows, a_cols), dtype=torch.int8, device='cuda')
        b = torch.randint(-128, 127, (b_rows, b_cols), dtype=torch.int8, device='cuda')
        c = torch.zeros((M, N), dtype=torch.int8, device='cuda')
        def matmul_op(a, b, c):
            a_op = a.t() if transA else a
            b_op = b.t() if transB else b
            result = torch.mm(a_op.float(), b_op.float()).to(torch.int8)
            if alpha != 1.0 or beta != 0.0:
                result = (alpha * result.float()).to(torch.int8) + beta * c
            c.copy_(result)
            return c
    else:
        # Standard precision modes - addmm is supported, use the high-performance path
        dtype_map = {
            'float64': torch.float64,
            'float32': torch.float32,
            'float16': torch.float16,
            'bfloat16': torch.bfloat16,
        }
        dtype = dtype_map.get(dtype_config, torch.float32)
        a = torch.randn((a_rows, a_cols), dtype=dtype, device='cuda')
        b = torch.randn((b_rows, b_cols), dtype=dtype, device='cuda')
        c = torch.zeros((M, N), dtype=dtype, device='cuda')
        matmul_op = get_blas_op(alpha, beta, transA, transB)
    # Warmup
    for _ in range(args.warmup_iterations):
        matmul_op(a, b, c)
    # Synchronize to make sure warmup has finished
    torch.cuda.synchronize()
    # Timed run
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    start_event.record()
    for _ in range(args.bench_iterations):
        matmul_op(a, b, c)
    end_event.record()
    end_event.synchronize()
    latency_ms = start_event.elapsed_time(end_event)
    avg_latency_us = latency_ms * 1e3 / args.bench_iterations
    # FLOPs per GEMM: 2*M*N*K (multiply + add)
    total_flops = 2 * M * N * K
    tflops = total_flops / (avg_latency_us * 1e-6) / 1e12
    return avg_latency_us, tflops, True

def main():
    args = parse_args()
    print(f"\n{'='*80}")
    print(f"GEMM Benchmark")
    print(f"Matrix Size: [{args.M}, {args.K}] x [{args.K}, {args.N}]")
    if args.transA:
        print(f"Transpose A: Yes (actual A shape: [{args.K}, {args.M}])")
    if args.transB:
        print(f"Transpose B: Yes (actual B shape: [{args.N}, {args.K}])")
    print(f"Alpha: {args.alpha}, Beta: {args.beta}")
    print(f"Data Type: {args.dtype}")
    print(f"{'='*80}")
    try:
        avg_latency_us, tflops, success = benchmark_gemm(args, args.dtype)
        if success:
            print(f"\nResults:")
            print(f" Warmup iterations: {args.warmup_iterations}")
            print(f" Benchmark iterations: {args.bench_iterations}")
            print(f" Average latency: {avg_latency_us:.3f} μs")
            print(f" Performance: {tflops:.3f} TFLOPS")
        else:
            print(f"\nBenchmark failed for {args.dtype}")
    except Exception as e:
        print(f"\nError: {str(e)}")
        import traceback
        traceback.print_exc()
        print(f" Benchmark failed for {args.dtype}")


if __name__ == "__main__":
    main()
Git Version: 4bd05bb5-dirty
transA,transB,grouped_gemm,batch_count,m,n,k,alpha,lda,stride_a,beta,ldb,stride_b,ldc,stride_c,ldd,stride_d,a_type,b_type,c_type,d_type,compute_type,scaleA,scaleB,scaleC,scaleD,amaxD,activation_type,bias_vector,bias_type,hipblaslt-Gflops,us,solution_index,gcnArchName,CUs
T,N,0,1,128,128,128,1,128,16384,0,128,16384,128,16384,128,16384,i8_r,i8_r,i32_r,f16_r,i32_r,2,2,0,0,0,,1,f16_r,624.226,6.71921,4452,gfx936:sramecc+:xnack-,80
transA,transB,grouped_gemm,batch_count,m,n,k,alpha,lda,stride_a,beta,ldb,stride_b,ldc,stride_c,ldd,stride_d,a_type,b_type,c_type,d_type,compute_type,scaleA,scaleB,scaleC,scaleD,amaxD,activation_type,bias_vector,bias_type,hipblaslt-Gflops,us,solution_index,gcnArchName,CUs
T,N,0,1,256,256,256,1,256,65536,0,256,65536,256,65536,256,65536,i8_r,i8_r,i32_r,f16_r,i32_r,2,2,0,0,0,,1,f16_r,4403.76,7.6195,4452,gfx936:sramecc+:xnack-,80
transA,transB,grouped_gemm,batch_count,m,n,k,alpha,lda,stride_a,beta,ldb,stride_b,ldc,stride_c,ldd,stride_d,a_type,b_type,c_type,d_type,compute_type,scaleA,scaleB,scaleC,scaleD,amaxD,activation_type,bias_vector,bias_type,hipblaslt-Gflops,us,solution_index,gcnArchName,CUs
T,N,0,1,512,512,512,1,512,262144,0,512,262144,512,262144,512,262144,i8_r,i8_r,i32_r,f16_r,i32_r,2,2,0,0,0,,1,f16_r,26791.3,10.0195,4500,gfx936:sramecc+:xnack-,80
transA,transB,grouped_gemm,batch_count,m,n,k,alpha,lda,stride_a,beta,ldb,stride_b,ldc,stride_c,ldd,stride_d,a_type,b_type,c_type,d_type,compute_type,scaleA,scaleB,scaleC,scaleD,amaxD,activation_type,bias_vector,bias_type,hipblaslt-Gflops,us,solution_index,gcnArchName,CUs
T,N,0,1,1024,1024,1024,1,1024,1048576,0,1024,1048576,1024,1048576,1024,1048576,i8_r,i8_r,i32_r,f16_r,i32_r,2,2,0,0,0,,1,f16_r,108893,19.721,4456,gfx936:sramecc+:xnack-,80
transA,transB,grouped_gemm,batch_count,m,n,k,alpha,lda,stride_a,beta,ldb,stride_b,ldc,stride_c,ldd,stride_d,a_type,b_type,c_type,d_type,compute_type,scaleA,scaleB,scaleC,scaleD,amaxD,activation_type,bias_vector,bias_type,hipblaslt-Gflops,us,solution_index,gcnArchName,CUs
T,N,0,1,2048,2048,2048,1,2048,4194304,0,2048,4194304,2048,4194304,2048,4194304,i8_r,i8_r,i32_r,f16_r,i32_r,2,2,0,0,0,,1,f16_r,315119,54.5186,4477,gfx936:sramecc+:xnack-,80
transA,transB,grouped_gemm,batch_count,m,n,k,alpha,lda,stride_a,beta,ldb,stride_b,ldc,stride_c,ldd,stride_d,a_type,b_type,c_type,d_type,compute_type,scaleA,scaleB,scaleC,scaleD,amaxD,activation_type,bias_vector,bias_type,hipblaslt-Gflops,us,solution_index,gcnArchName,CUs
T,N,0,1,4096,4096,4096,1,4096,16777216,0,4096,16777216,4096,16777216,4096,16777216,i8_r,i8_r,i32_r,f16_r,i32_r,2,2,0,0,0,,1,f16_r,475043,289.319,4478,gfx936:sramecc+:xnack-,80
transA,transB,grouped_gemm,batch_count,m,n,k,alpha,lda,stride_a,beta,ldb,stride_b,ldc,stride_c,ldd,stride_d,a_type,b_type,c_type,d_type,compute_type,scaleA,scaleB,scaleC,scaleD,amaxD,activation_type,bias_vector,bias_type,hipblaslt-Gflops,us,solution_index,gcnArchName,CUs
T,N,0,1,8192,8192,8192,1,8192,67108864,0,8192,67108864,8192,67108864,8192,67108864,i8_r,i8_r,i32_r,f16_r,i32_r,2,2,0,0,0,,1,f16_r,674829,1629.32,4479,gfx936:sramecc+:xnack-,80
transA,transB,grouped_gemm,batch_count,m,n,k,alpha,lda,stride_a,beta,ldb,stride_b,ldc,stride_c,ldd,stride_d,a_type,b_type,c_type,d_type,compute_type,scaleA,scaleB,scaleC,scaleD,amaxD,activation_type,bias_vector,bias_type,hipblaslt-Gflops,us,solution_index,gcnArchName,CUs
T,N,0,1,4098,4098,4098,1,4098,16793604,0,4098,16793604,4098,16793604,4098,16793604,i8_r,i8_r,i32_r,f16_r,i32_r,2,2,0,0,0,,1,f16_r,209556,656.819,4478,gfx936:sramecc+:xnack-,80
transA,transB,grouped_gemm,batch_count,m,n,k,alpha,lda,stride_a,beta,ldb,stride_b,ldc,stride_c,ldd,stride_d,a_type,b_type,c_type,d_type,compute_type,scaleA,scaleB,scaleC,scaleD,amaxD,activation_type,bias_vector,bias_type,hipblaslt-Gflops,us,solution_index,gcnArchName,CUs
T,N,0,1,8190,8190,8190,1,8190,67076100,0,8190,67076100,8190,67076100,8190,67076100,i8_r,i8_r,i32_r,f16_r,i32_r,2,2,0,0,0,,1,f16_r,266242,4126.72,4479,gfx936:sramecc+:xnack-,80
transA,transB,grouped_gemm,batch_count,m,n,k,alpha,lda,stride_a,beta,ldb,stride_b,ldc,stride_c,ldd,stride_d,a_type,b_type,c_type,d_type,compute_type,scaleA,scaleB,scaleC,scaleD,amaxD,activation_type,bias_vector,bias_type,hipblaslt-Gflops,us,solution_index,gcnArchName,CUs
T,N,0,1,8192,8192,768,1,8192,67108864,0,768,6291456,8192,67108864,8192,67108864,i8_r,i8_r,i32_r,f16_r,i32_r,2,2,0,0,0,,1,f16_r,277453,371.519,4484,gfx936:sramecc+:xnack-,80
#!/bin/bash
# GEMM Benchmark Shell Script
# Tests GEMM performance across various matrix shapes and data types
export ROCBLAS_TENSILE_LIBPATH=/opt/dtk-26.04/lib/rocblas/auto_select_test/auto_select_tools/optimization_configs/new/config/library_gpu5/
export HIP_VISIBLE_DEVICES=3
set -e
# Colored output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Log and results files
LOG_FILE="gemm_benchmark_$(date +%Y%m%d_%H%M%S).log"
CSV_FILE="gemm_benchmark_results_$(date +%Y%m%d_%H%M%S).csv"
# Path to the Python benchmark script
PYTHON_SCRIPT="gemm_benchmark.py"
# Default parameters
WARMUP_ITER=50
BENCH_ITER=1000
# Colored logging helpers
print_info() {
echo -e "${BLUE}[INFO]${NC} $1" | tee -a "$LOG_FILE"
}
print_success() {
echo -e "${GREEN}[SUCCESS]${NC} $1" | tee -a "$LOG_FILE"
}
print_error() {
echo -e "${RED}[ERROR]${NC} $1" | tee -a "$LOG_FILE"
}
print_section() {
echo -e "\n${YELLOW}========================================${NC}" | tee -a "$LOG_FILE"
echo -e "${YELLOW}$1${NC}" | tee -a "$LOG_FILE"
echo -e "${YELLOW}========================================${NC}" | tee -a "$LOG_FILE"
}
# Initialize the CSV file
init_csv() {
echo "Shape_M,Shape_K,Shape_N,DataType,Latency_us,TFLOPS,Status" > "$CSV_FILE"
}
# Run a single test case
run_test() {
local M=$1
local K=$2
local N=$3
local dtype=$4
print_info "Testing: M=$M, K=$K, N=$N, dtype=$dtype"
# Run the Python script and capture its output (|| true keeps set -e from aborting the suite on a failed case)
output=$(python3 "$PYTHON_SCRIPT" \
--M "$M" \
--K "$K" \
--N "$N" \
--dtype "$dtype" \
--warmup_iterations "$WARMUP_ITER" \
--bench_iterations "$BENCH_ITER" --transA 2>&1) || true
# Extract latency and TFLOPS from the report
latency=$(echo "$output" | grep "Average latency:" | awk '{print $3}')
tflops=$(echo "$output" | grep "Performance:" | awk '{print $2}')
if [ -n "$latency" ] && [ -n "$tflops" ]; then
print_success " Latency: ${latency} μs, TFLOPS: ${tflops}"
echo "$M,$K,$N,$dtype,$latency,$tflops,SUCCESS" >> "$CSV_FILE"
else
print_error " Test failed for $dtype"
echo "$M,$K,$N,$dtype,0,0,FAILED" >> "$CSV_FILE"
fi
echo "" >> "$LOG_FILE"
}
# Test case 1: M=K=N (powers of two)
test_power_of_two() {
print_section "Test Case 1: Square matrices (power of 2)"
local sizes=(128 256 512 1024 2048 4096 8192)
for size in "${sizes[@]}"; do
print_info "Testing square matrix: $size x $size"
for dtype in "${dtypes[@]}"; do
run_test "$size" "$size" "$size" "$dtype"
done
done
}
# Test case 2: M=K=N (non-aligned sizes)
test_non_aligned() {
print_section "Test Case 2: Square matrices (non-aligned)"
local sizes=(4098 8190)
for size in "${sizes[@]}"; do
print_info "Testing square matrix: $size x $size"
for dtype in "${dtypes[@]}"; do
run_test "$size" "$size" "$size" "$dtype"
done
done
}
# Test case 3: specific shape
test_specific_shape() {
print_section "Test Case 3: Specific shape (M=8192, K=768, N=8192)"
run_test 8192 768 8192 "float64"
run_test 8192 768 8192 "float32"
run_test 8192 768 8192 "float16"
run_test 8192 768 8192 "bfloat16"
run_test 8192 768 8192 "tf32"
run_test 8192 768 8192 "mixed_fp16_fp32"
run_test 8192 768 8192 "mixed_bf16_fp32"
run_test 8192 768 8192 "mixed_int8_int32"
run_test 8192 768 8192 "mixed_tf32_fp32"
run_test 8192 768 8192 "w8a8"
}
# Check whether CUDA is available
check_cuda() {
print_info "Checking CUDA availability..."
if python3 -c "import torch; assert torch.cuda.is_available()" 2>/dev/null; then
cuda_version=$(python3 -c "import torch; print(torch.version.cuda)")
gpu_name=$(python3 -c "import torch; print(torch.cuda.get_device_name(0))")
print_success "CUDA available: $cuda_version"
print_success "GPU: $gpu_name"
echo "CUDA Version: $cuda_version" >> "$LOG_FILE"
echo "GPU: $gpu_name" >> "$LOG_FILE"
else
print_error "CUDA not available. Exiting."
exit 1
fi
}
# Main entry point
main() {
print_info "Starting GEMM Benchmark Suite"
print_info "Log file: $LOG_FILE"
print_info "Results CSV: $CSV_FILE"
# Check CUDA
check_cuda
# Initialize the CSV file
init_csv
# Record system information
echo "System Information:" >> "$LOG_FILE"
echo "Date: $(date)" >> "$LOG_FILE"
echo "Hostname: $(hostname)" >> "$LOG_FILE"
echo "Python: $(which python3)" >> "$LOG_FILE"
echo "PyTorch: $(python3 -c 'import torch; print(torch.__version__)')" >> "$LOG_FILE"
echo "" >> "$LOG_FILE"
# Data types to test
dtypes=(
"float64"
"float32"
"float16"
"bfloat16"
"tf32"
"mixed_fp16_fp32"
"mixed_bf16_fp32"
"mixed_int8_int32"
"mixed_tf32_fp32"
"w8a8"
)
# Run the test suites
test_power_of_two
test_non_aligned
test_specific_shape
print_section "Benchmark Complete"
print_success "All tests finished. Results saved to $CSV_FILE"
print_info "Log saved to $LOG_FILE"
# Print a results summary
echo -e "\n${GREEN}Results Summary:${NC}"
echo "========================================="
echo "CSV file: $CSV_FILE"
echo "Log file: $LOG_FILE"
echo ""
echo "To view results:"
echo " cat $CSV_FILE"
echo " or use a spreadsheet application"
}
# Run the main function
main
#!/bin/bash
# hipBLASLt W8A8 GEMM performance test script
# Tests GEMM performance for multiple matrix shapes and data types
# Select the GPU device to use
export HIP_VISIBLE_DEVICES=7
chmod +x ${ROCM_PATH}/bin/hipblaslt-bench
export HIPBLASLT_TUNING_OVERRIDE_FILE=hipblaslt.config
# Output file
OUTPUT_CSV="hipblaslt_bench_results.csv"
# Write the CSV header (all fields)
echo "shape_type,data_type,M,N,K,alpha,beta,transA,transB,grouped_gemm,batch_count,lda,stride_a,ldb,stride_b,ldc,stride_c,ldd,stride_d,a_type,b_type,c_type,d_type,compute_type,scaleA,scaleB,scaleC,scaleD,amaxD,activation_type,bias_vector,bias_type,hipblaslt-Gflops,us,status" > ${OUTPUT_CSV}
# Define the test function
run_test() {
local shape_type=$1
local data_type=$2
local M=$3
local N=$4
local K=$5
local alpha=${6}
local beta=${7}
echo "=========================================="
echo "Running test: ${shape_type} - ${data_type} - M=${M} N=${N} K=${K}"
# Build the hipblaslt-bench command
local cmd="hipblaslt-bench --api_method c \
-m ${M} -n ${N} -k ${K} \
--alpha ${alpha} --beta ${beta} \
--transA T --transB N \
--batch_count 1 \
--scaleA 2 --scaleB 2 \
--bias_vector \
--bias_source d \
--a_type i8_r --lda ${M} \
--b_type i8_r --ldb ${K} \
--c_type i32_r --ldc ${M} \
--d_type f16_r --ldd ${M} \
--scale_type f32_r \
--bias_type f16_r \
--compute_type i32_r \
--print_kernel_info \
--cold_iters 50 --iters 1000"
# Run hipblaslt-bench and capture its output
local output=$(eval ${cmd} 2>&1)
# Check for errors
if echo "${output}" | grep -q "error:"; then
echo "Test failed: ${shape_type} - ${data_type}"
echo "${shape_type},${data_type},${M},${N},${K},${alpha},${beta},T,N,0,1,0,0,0,0,0,0,0,0,i8_r,i8_r,i32_r,f16_r,i32_r,2,2,0,0,0,none,1,f16_r,FAILED,FAILED,ERROR" >> ${OUTPUT_CSV}
return
fi
# Extract the actual data row
# Method 1: look for a line that starts with the transA value (T or N) and has enough comma-separated fields
# Data row format: T,N,0,1,1024,1024,1024,1,1024,1048576,0,1024,1048576,1024,1048576,1024,1048576,i8_r,i8_r,i32_r,f16_r,i32_r,2,2,0,0,0,none,1,f16_r,138933,15.457
local data_line=""
# First try lines starting with "T," or "N," (the signature of a data row)
data_line=$(echo "${output}" | grep -E "^(T|N),.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,[0-9]+\.[0-9]+,[0-9]+\.[0-9]+$" | tail -1)
# If nothing was found, look for the header line containing hipblaslt-Gflops
if [ -z "${data_line}" ]; then
data_line=$(echo "${output}" | grep "hipblaslt-Gflops" | tail -1)
# If the header was found, take the following line (the actual data)
if [ -n "${data_line}" ]; then
data_line=$(echo "${output}" | grep -A1 "hipblaslt-Gflops" | tail -1)
fi
fi
# If still nothing, look for a line that starts with T or N and ends in two numbers
if [ -z "${data_line}" ]; then
data_line=$(echo "${output}" | grep -E "^(T|N)," | grep -E "[0-9]+\.[0-9]+,[0-9]+\.[0-9]+$" | tail -1)
fi
# Last resort: a comma-containing line that does not start with [, -, or whitespace
if [ -z "${data_line}" ]; then
data_line=$(echo "${output}" | grep -v "^\[" | grep -v "^-" | grep -v "^[[:space:]]*$" | grep "," | grep -E "[0-9]+\.[0-9]+,[0-9]+\.[0-9]+$" | tail -1)
fi
# Strip leading/trailing whitespace from the data line
data_line=$(echo "${data_line}" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
if [ -n "${data_line}" ] && echo "${data_line}" | grep -q ","; then
# Validate the data row: it should start with T, or N, and contain at least 30 commas
local comma_count=$(echo "${data_line}" | tr -cd ',' | wc -c)
if [ ${comma_count} -lt 30 ]; then
echo "Warning: Data line has only ${comma_count} commas, might be incomplete"
echo "Data line: ${data_line}"
fi
# Parse all fields of the data row
# Field indices (1-based):
# 1:transA, 2:transB, 3:grouped_gemm, 4:batch_count, 5:m, 6:n, 7:k,
# 8:alpha, 9:lda, 10:stride_a, 11:beta, 12:ldb, 13:stride_b,
# 14:ldc, 15:stride_c, 16:ldd, 17:stride_d,
# 18:a_type, 19:b_type, 20:c_type, 21:d_type, 22:compute_type,
# 23:scaleA, 24:scaleB, 25:scaleC, 26:scaleD, 27:amaxD,
# 28:activation_type, 29:bias_vector, 30:bias_type,
# 31:hipblaslt-Gflops, 32:us
local transA=$(echo "${data_line}" | awk -F',' '{print $1}')
local transB=$(echo "${data_line}" | awk -F',' '{print $2}')
local grouped_gemm=$(echo "${data_line}" | awk -F',' '{print $3}')
local batch_count=$(echo "${data_line}" | awk -F',' '{print $4}')
# Extract fields based on the actual field count
local field_count=$(echo "${data_line}" | awk -F',' '{print NF}')
if [ ${field_count} -ge 32 ]; then
local lda=$(echo "${data_line}" | awk -F',' '{print $9}')
local stride_a=$(echo "${data_line}" | awk -F',' '{print $10}')
local ldb=$(echo "${data_line}" | awk -F',' '{print $12}')
local stride_b=$(echo "${data_line}" | awk -F',' '{print $13}')
local ldc=$(echo "${data_line}" | awk -F',' '{print $14}')
local stride_c=$(echo "${data_line}" | awk -F',' '{print $15}')
local ldd=$(echo "${data_line}" | awk -F',' '{print $16}')
local stride_d=$(echo "${data_line}" | awk -F',' '{print $17}')
local a_type=$(echo "${data_line}" | awk -F',' '{print $18}')
local b_type=$(echo "${data_line}" | awk -F',' '{print $19}')
local c_type=$(echo "${data_line}" | awk -F',' '{print $20}')
local d_type=$(echo "${data_line}" | awk -F',' '{print $21}')
local compute_type=$(echo "${data_line}" | awk -F',' '{print $22}')
local scaleA=$(echo "${data_line}" | awk -F',' '{print $23}')
local scaleB=$(echo "${data_line}" | awk -F',' '{print $24}')
local scaleC=$(echo "${data_line}" | awk -F',' '{print $25}')
local scaleD=$(echo "${data_line}" | awk -F',' '{print $26}')
local amaxD=$(echo "${data_line}" | awk -F',' '{print $27}')
local activation_type=$(echo "${data_line}" | awk -F',' '{print $28}')
local bias_vector=$(echo "${data_line}" | awk -F',' '{print $29}')
local bias_type=$(echo "${data_line}" | awk -F',' '{print $30}')
local gflops=$(echo "${data_line}" | awk -F',' '{print $31}')
local us=$(echo "${data_line}" | awk -F',' '{print $32}')
else
# If there are not enough fields, take Gflops and us from the end of the line
local gflops=$(echo "${data_line}" | awk -F',' '{print $(NF-1)}')
local us=$(echo "${data_line}" | awk -F',' '{print $NF}')
# Fall back to default values for the remaining fields
lda=0; stride_a=0; ldb=0; stride_b=0; ldc=0; stride_c=0; ldd=0; stride_d=0
a_type="i8_r"; b_type="i8_r"; c_type="i32_r"; d_type="f16_r"
compute_type="i32_r"; scaleA=2; scaleB=2; scaleC=0; scaleD=0
amaxD=0; activation_type="none"; bias_vector=1; bias_type="f16_r"
fi
# Strip any leading/trailing whitespace
gflops=$(echo "${gflops}" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
us=$(echo "${us}" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
echo "Result: ${gflops} GFLOPS, ${us} us"
# Append the parsed row to the CSV
echo "${shape_type},${data_type},${M},${N},${K},${alpha},${beta},${transA},${transB},${grouped_gemm},${batch_count},${lda},${stride_a},${ldb},${stride_b},${ldc},${stride_c},${ldd},${stride_d},${a_type},${b_type},${c_type},${d_type},${compute_type},${scaleA},${scaleB},${scaleC},${scaleD},${amaxD},${activation_type},${bias_vector},${bias_type},${gflops},${us},SUCCESS" >> ${OUTPUT_CSV}
else
echo "Test failed to produce valid output: ${shape_type} - ${data_type}"
echo "Raw output (last 20 lines):"
echo "${output}" | tail -20
echo "${shape_type},${data_type},${M},${N},${K},${alpha},${beta},T,N,0,1,0,0,0,0,0,0,0,0,i8_r,i8_r,i32_r,f16_r,i32_r,2,2,0,0,0,none,1,f16_r,FAILED,FAILED,NO_OUTPUT" >> ${OUTPUT_CSV}
fi
}
# Test case 3a: M, K, and N all equal
for size in 128 256 512 1024 2048 4096 8192; do
run_test "square" "w8a8" ${size} ${size} ${size} 1.0 0.0
done
# Test case 3b: sizes not aligned to 4
for size in 4098 8190; do
run_test "square" "w8a8" ${size} ${size} ${size} 1.0 0.0
done
# Test case 3c: M=8192, K=768, N=8192
run_test "square" "w8a8" 8192 8192 768 1.0 0.0
echo "=========================================="
echo "All tests completed. Results saved to ${OUTPUT_CSV}"
#!/bin/bash
# ROCm rocBLAS performance test script
# Tests GEMM performance for multiple matrix shapes and data types
# Output file
export ROCBLAS_TENSILE_LIBPATH=./library_gpu5
export HIP_VISIBLE_DEVICES=3
chmod +x ${ROCM_PATH}/bin/rocblas-bench
OUTPUT_CSV="rocblas_bench_results_transB.csv"
# Write the CSV header
echo "shape_type,data_type,M,N,K,alpha,beta,transA,transB,a_type,b_type,c_type,d_type,compute_type,rocblas-Gflops,us" > ${OUTPUT_CSV}
# Define the test function
run_test() {
local shape_type=$1
local data_type=$2
local M=$3
local N=$4
local K=$5
local a_type=$6
local b_type=$7
local c_type=$8
local d_type=$9
local compute_type=${10}
local alpha=${11}
local beta=${12}
local math_mode=${13}
echo "Running test: ${shape_type} - ${data_type} - M=${M} N=${N} K=${K}"
# Build the rocblas-bench command (with --transposeB T, B is stored N x K, so ldb uses N)
local cmd="rocblas-bench -f gemm_ex \
-m ${M} -n ${N} -k ${K} \
--alpha ${alpha} \
--transposeA N --transposeB T \
--a_type ${a_type} --lda ${M} \
--b_type ${b_type} --ldb ${N} \
--beta ${beta} \
--c_type ${c_type} --ldc ${M} \
--d_type ${d_type} --ldd ${M} \
--compute_type ${compute_type} \
--cold_iters 50 --iters 1000"
# Append math_mode if it was specified
if [ -n "${math_mode}" ]; then
cmd="${cmd} --math_mode ${math_mode}"
fi
# Run rocblas-bench and keep only the last line of output
local output=$(eval ${cmd} 2>/dev/null | tail -1)
# Parse the output
if [ -n "${output}" ]; then
# Extract the rocblas-Gflops and us columns
local gflops=$(echo ${output} | awk '{print $(NF-1)}')
local us=$(echo ${output} | awk '{print $NF}')
echo "${shape_type},${data_type},${M},${N},${K},${alpha},${beta},N,T,${a_type},${b_type},${c_type},${d_type},${compute_type},${gflops},${us}" >> ${OUTPUT_CSV}
else
echo "Test failed or not supported: ${shape_type} - ${data_type}"
echo "${shape_type},${data_type},${M},${N},${K},${alpha},${beta},N,T,${a_type},${b_type},${c_type},${d_type},${compute_type},FAILED,FAILED" >> ${OUTPUT_CSV}
fi
}
# Test case 3a: M, K, and N all equal
for size in 128 256 512 1024 2048 4096 8192; do
# fp64
run_test "square" "fp64" ${size} ${size} ${size} "f64_r" "f64_r" "f64_r" "f64_r" "f64_r" 1.0 0.0 ""
# fp32
run_test "square" "fp32" ${size} ${size} ${size} "f32_r" "f32_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 ""
# fp16
run_test "square" "fp16" ${size} ${size} ${size} "f16_r" "f16_r" "f16_r" "f16_r" "f32_r" 1.0 0.0 ""
# bf16
run_test "square" "bf16" ${size} ${size} ${size} "bf16_r" "bf16_r" "bf16_r" "bf16_r" "f32_r" 1.0 0.0 ""
# tf32 - use f32_r types with math_mode=1
run_test "square" "tf32" ${size} ${size} ${size} "f32_r" "f32_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 "1"
done
# Test case 3b: sizes not aligned to 4
for size in 4098 8190; do
# fp64
run_test "non_aligned" "fp64" ${size} ${size} ${size} "f64_r" "f64_r" "f64_r" "f64_r" "f64_r" 1.0 0.0 ""
# fp32
run_test "non_aligned" "fp32" ${size} ${size} ${size} "f32_r" "f32_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 ""
# fp16
run_test "non_aligned" "fp16" ${size} ${size} ${size} "f16_r" "f16_r" "f16_r" "f16_r" "f32_r" 1.0 0.0 ""
# bf16
run_test "non_aligned" "bf16" ${size} ${size} ${size} "bf16_r" "bf16_r" "bf16_r" "bf16_r" "f32_r" 1.0 0.0 ""
# tf32 - use f32_r types with math_mode=1
run_test "non_aligned" "tf32" ${size} ${size} ${size} "f32_r" "f32_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 "1"
done
# Test case 3c: M=8192, K=768, N=8192
run_test "special" "fp64" 8192 8192 768 "f64_r" "f64_r" "f64_r" "f64_r" "f64_r" 1.0 0.0 ""
run_test "special" "fp32" 8192 8192 768 "f32_r" "f32_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 ""
run_test "special" "fp16" 8192 8192 768 "f16_r" "f16_r" "f16_r" "f16_r" "f32_r" 1.0 0.0 ""
run_test "special" "bf16" 8192 8192 768 "bf16_r" "bf16_r" "bf16_r" "bf16_r" "f32_r" 1.0 0.0 ""
run_test "special" "tf32" 8192 8192 768 "f32_r" "f32_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 "1"
# Mixed-precision tests
for size in 128 256 512 1024 2048 4096 8192; do
# fp16_fp32
run_test "mixed" "fp16_fp32" ${size} ${size} ${size} "f16_r" "f16_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 ""
# bf16_fp32
run_test "mixed" "bf16_fp32" ${size} ${size} ${size} "bf16_r" "bf16_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 ""
# int8_int32
run_test "mixed" "int8_int32" ${size} ${size} ${size} "i8_r" "i8_r" "i32_r" "i32_r" "i32_r" 1.0 0.0 ""
done
for size in 4098 8190; do
# fp16_fp32
run_test "mixed" "fp16_fp32" ${size} ${size} ${size} "f16_r" "f16_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 ""
# bf16_fp32
run_test "mixed" "bf16_fp32" ${size} ${size} ${size} "bf16_r" "bf16_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 ""
# int8_int32
run_test "mixed" "int8_int32" ${size} ${size} ${size} "i8_r" "i8_r" "i32_r" "i32_r" "i32_r" 1.0 0.0 ""
done
# Case 5b: A,B: fp16, C,D: fp32
run_test "mixed" "fp16_fp32" 4096 4096 4096 "f16_r" "f16_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 ""
run_test "mixed" "fp16_fp32" 8192 8192 8192 "f16_r" "f16_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 ""
run_test "mixed" "fp16_fp32" 8192 8192 768 "f16_r" "f16_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 ""
# Case 5c: A,B: bf16, C,D: fp32
run_test "mixed" "bf16_fp32" 4096 4096 4096 "bf16_r" "bf16_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 ""
run_test "mixed" "bf16_fp32" 8192 8192 8192 "bf16_r" "bf16_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 ""
run_test "mixed" "bf16_fp32" 8192 8192 768 "bf16_r" "bf16_r" "f32_r" "f32_r" "f32_r" 1.0 0.0 ""
# Case 5e: A,B: int8, C,D: int32
run_test "mixed" "int8_int32" 4096 4096 4096 "i8_r" "i8_r" "i32_r" "i32_r" "i32_r" 1.0 0.0 ""
run_test "mixed" "int8_int32" 8192 8192 8192 "i8_r" "i8_r" "i32_r" "i32_r" "i32_r" 1.0 0.0 ""
run_test "mixed" "int8_int32" 8192 8192 768 "i8_r" "i8_r" "i32_r" "i32_r" "i32_r" 1.0 0.0 ""