run_gemm_benchmarks_torch.sh 5.38 KB
Newer Older
sunzhq2's avatar
sunzhq2 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
#!/bin/bash

# GEMM benchmark driver script.
# Runs the GEMM benchmark over a set of matrix shapes and data types and
# records the results in a log file and a CSV file.

# Environment for the run. Both values are hard-coded for one machine:
# ROCBLAS_TENSILE_LIBPATH points at a "library_gpu5" config directory and
# HIP_VISIBLE_DEVICES restricts the run to device index 3.
export ROCBLAS_TENSILE_LIBPATH=/opt/dtk-26.04/lib/rocblas/auto_select_test/auto_select_tools/optimization_configs/new/config/library_gpu5/
export HIP_VISIBLE_DEVICES=3

# Abort on the first unhandled command failure.
set -e

# ANSI color escape sequences used for console output.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Output files. The timestamp is taken exactly once so the log file and the
# CSV file always share the same suffix; calling `date` twice could yield two
# different names when the clock ticks over between the two calls.
RUN_TIMESTAMP="$(date +%Y%m%d_%H%M%S)"
LOG_FILE="gemm_benchmark_${RUN_TIMESTAMP}.log"
CSV_FILE="gemm_benchmark_results_${RUN_TIMESTAMP}.csv"

# Path to the Python benchmark script (relative to the current directory).
PYTHON_SCRIPT="gemm_benchmark.py"

# Default iteration counts passed to the benchmark script.
WARMUP_ITER=50
BENCH_ITER=1000

# Colored message helpers.

# Print an informational message with a blue [INFO] tag.
# Globals:   BLUE, NC (read); LOG_FILE (appended to)
# Arguments: $1 - message text (backslash escapes in it are interpreted)
# Outputs:   the tagged line on stdout; the same line, including the color
#            escape codes, is appended to $LOG_FILE via tee.
print_info() {
    local msg=$1
    printf '%b\n' "${BLUE}[INFO]${NC} ${msg}" | tee -a "$LOG_FILE"
}

# Print a success message with a green [SUCCESS] tag.
# Globals:   GREEN, NC (read); LOG_FILE (appended to)
# Arguments: $1 - message text (backslash escapes in it are interpreted)
# Outputs:   the tagged line on stdout and, via tee, appended to $LOG_FILE.
print_success() {
    local tag="${GREEN}[SUCCESS]${NC}"
    printf '%b\n' "${tag} $1" | tee -a "$LOG_FILE"
}

# Print an error message with a red [ERROR] tag.
# Globals:   RED, NC (read); LOG_FILE (appended to)
# Arguments: $1 - message text (backslash escapes in it are interpreted)
# Outputs:   the tagged line on stdout (not stderr) and, via tee, appended
#            to $LOG_FILE.
print_error() {
    local text
    text="${RED}[ERROR]${NC} $1"
    printf '%b\n' "$text" | tee -a "$LOG_FILE"
}

# Print a section banner: a blank line, a rule, the title, and a second rule,
# all in yellow.
# Globals:   YELLOW, NC (read); LOG_FILE (appended to)
# Arguments: $1 - section title
# Outputs:   the banner on stdout; each line is also appended to $LOG_FILE.
print_section() {
    local rule="${YELLOW}========================================${NC}"
    local line
    # One pipeline per banner line; the leading "\n" on the first entry is
    # expanded by %b into the blank line that precedes the banner.
    for line in "\n${rule}" "${YELLOW}$1${NC}" "${rule}"; do
        printf '%b\n' "$line" | tee -a "$LOG_FILE"
    done
}

# Create (or truncate) the results CSV and write its header row.
# Globals: CSV_FILE (overwritten)
init_csv() {
    local header="Shape_M,Shape_K,Shape_N,DataType,Latency_us,TFLOPS,Status"
    printf '%s\n' "$header" > "$CSV_FILE"
}

# Run one benchmark configuration and record its result.
# Globals:
#   PYTHON_SCRIPT, WARMUP_ITER, BENCH_ITER (read)
#   LOG_FILE, CSV_FILE (appended to)
# Arguments:
#   $1 - M, $2 - K, $3 - N (passed through as --M/--K/--N)
#   $4 - data type name (passed through as --dtype)
# Outputs:
#   Appends one row to $CSV_FILE: the parsed latency and TFLOPS with status
#   SUCCESS, or "0,0,FAILED" when either value could not be parsed from the
#   benchmark output. On failure the exit status and the captured benchmark
#   output are appended to $LOG_FILE for diagnosis.
run_test() {
    local M=$1
    local K=$2
    local N=$3
    local dtype=$4
    local output latency tflops
    local rc=0

    print_info "Testing: M=$M, K=$K, N=$N, dtype=$dtype"

    # Run the Python script and capture stdout+stderr. The `|| rc=$?` keeps a
    # non-zero exit from aborting the whole suite when the caller runs under
    # `set -e`; the failure is recorded as a FAILED row below instead.
    output=$(python3 "$PYTHON_SCRIPT" \
        --M "$M" \
        --K "$K" \
        --N "$N" \
        --dtype "$dtype" \
        --warmup_iterations "$WARMUP_ITER" \
        --bench_iterations "$BENCH_ITER" --transA 2>&1) || rc=$?

    # Extract latency (3rd field) and TFLOPS (2nd field). Only the first
    # matching line is used so the CSV row can never span several lines.
    latency=$(awk '/Average latency:/ {print $3; exit}' <<<"$output")
    tflops=$(awk '/Performance:/ {print $2; exit}' <<<"$output")

    if [[ -n "$latency" && -n "$tflops" ]]; then
        print_success "  Latency: ${latency} μs, TFLOPS: ${tflops}"
        echo "$M,$K,$N,$dtype,$latency,$tflops,SUCCESS" >> "$CSV_FILE"
    else
        print_error "  Test failed for $dtype"
        echo "$M,$K,$N,$dtype,0,0,FAILED" >> "$CSV_FILE"
        # Keep the evidence: without this the benchmark's error text is lost.
        {
            echo "  Exit status: $rc"
            echo "  Captured output:"
            printf '%s\n' "$output"
        } >> "$LOG_FILE"
    fi

    echo "" >> "$LOG_FILE"
}

# Test case 1: square matrices (M = K = N) with power-of-two sizes.
# Globals:
#   dtypes (read) - array of data type names to benchmark; it must be set
#                   by the caller before this function runs.
# Calls run_test once for every (size, dtype) combination.
test_power_of_two() {
    local -a sizes=(128 256 512 1024 2048 4096 8192)
    # Loop variables are local so they do not overwrite caller/global state.
    local size dtype

    print_section "Test Case 1: Square matrices (power of 2)"

    for size in "${sizes[@]}"; do
        print_info "Testing square matrix: $size x $size"
        for dtype in "${dtypes[@]}"; do
            run_test "$size" "$size" "$size" "$dtype"
        done
    done
}

# Test case 2: square matrices (M = K = N) with non-aligned sizes
# (4098 and 8190).
# Globals:
#   dtypes (read) - array of data type names to benchmark; it must be set
#                   by the caller before this function runs.
# Calls run_test once for every (size, dtype) combination.
test_non_aligned() {
    local -a sizes=(4098 8190)
    # Loop variables are local so they do not overwrite caller/global state.
    local size dtype

    print_section "Test Case 2: Square matrices (non-aligned)"

    for size in "${sizes[@]}"; do
        print_info "Testing square matrix: $size x $size"
        for dtype in "${dtypes[@]}"; do
            run_test "$size" "$size" "$size" "$dtype"
        done
    done
}

# Test case 3: one fixed rectangular shape (M=8192, K=768, N=8192), run once
# for each data type in the list below. The list is kept inside this function
# and does not depend on any array defined elsewhere.
test_specific_shape() {
    print_section "Test Case 3: Specific shape (M=8192, K=768, N=8192)"

    local -a shape_dtypes=(
        "float64"
        "float32"
        "float16"
        "bfloat16"
        "tf32"
        "mixed_fp16_fp32"
        "mixed_bf16_fp32"
        "mixed_int8_int32"
        "mixed_tf32_fp32"
        "w8a8"
    )
    local dt
    for dt in "${shape_dtypes[@]}"; do
        run_test 8192 768 8192 "$dt"
    done
}

# Verify that PyTorch can see a GPU and report the toolkit version and the
# name of device 0. Exits the script with status 1 when no device is usable.
# Globals: LOG_FILE (appended to)
# Outputs: status messages via print_info/print_success/print_error, plus
#          "CUDA Version:" and "GPU:" lines appended to $LOG_FILE.
check_cuda() {
    local cuda_version gpu_name

    print_info "Checking CUDA availability..."
    # stderr is silenced on purpose: any failure here (missing torch, no
    # device) is reported through the single error message below.
    if ! python3 -c "import torch; assert torch.cuda.is_available()" 2>/dev/null; then
        print_error "CUDA not available. Exiting."
        exit 1
    fi

    # torch.version.cuda is None on ROCm/HIP builds of PyTorch; fall back to
    # torch.version.hip so the report shows a real version instead of "None".
    cuda_version=$(python3 -c "import torch; print(torch.version.cuda or getattr(torch.version, 'hip', None))")
    gpu_name=$(python3 -c "import torch; print(torch.cuda.get_device_name(0))")

    print_success "CUDA available: $cuda_version"
    print_success "GPU: $gpu_name"
    echo "CUDA Version: $cuda_version" >> "$LOG_FILE"
    echo "GPU: $gpu_name" >> "$LOG_FILE"
}

# Entry point: checks the GPU, initializes the CSV, records system
# information in the log, runs the three test groups, and prints a summary.
# Globals:
#   LOG_FILE, CSV_FILE, GREEN, NC (read)
#   dtypes (written) - intentionally NOT local: the test_* functions read
#                      this array to decide which data types to benchmark.
main() {
    print_info "Starting GEMM Benchmark Suite"
    print_info "Log file: $LOG_FILE"
    print_info "Results CSV: $CSV_FILE"

    # Make sure a GPU is usable before doing anything else.
    check_cuda

    # Start a fresh results file with its header row.
    init_csv

    # Record system information in the log. `command -v` is a shell builtin,
    # so the Python path is reported even on minimal images without `which`.
    {
        echo "System Information:"
        echo "Date: $(date)"
        echo "Hostname: $(hostname)"
        echo "Python: $(command -v python3)"
        echo "PyTorch: $(python3 -c 'import torch; print(torch.__version__)')"
        echo ""
    } >> "$LOG_FILE"

    # Data types to benchmark.
    dtypes=(
        "float64"
        "float32"
        "float16"
        "bfloat16"
        "tf32"
        "mixed_fp16_fp32"
        "mixed_bf16_fp32"
        "mixed_int8_int32"
        "mixed_tf32_fp32"
        "w8a8"
    )

    # Run the test groups.
    test_power_of_two
    test_non_aligned
    test_specific_shape

    print_section "Benchmark Complete"
    print_success "All tests finished. Results saved to $CSV_FILE"
    print_info "Log saved to $LOG_FILE"

    # Final summary (stdout only; not written to the log).
    echo -e "\n${GREEN}Results Summary:${NC}"
    echo "========================================="
    echo "CSV file: $CSV_FILE"
    echo "Log file: $LOG_FILE"
    echo ""
    echo "To view results:"
    echo "  cat $CSV_FILE"
    echo "  or use a spreadsheet application"
}

# Run the main function, forwarding any command-line arguments to it.
main "$@"