cublaslt_function.py 2.43 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Micro benchmark example for cuBLASLt GEMM performance benchmark.

Commands to run:
  python3 examples/benchmarks/cublaslt_function.py
"""

from superbench.benchmarks import BenchmarkRegistry, Platform
from superbench.common.utils import logger

if __name__ == '__main__':
    # Every scenario launches the same 'cublaslt-gemm' benchmark on the CUDA platform; only the
    # console banner, the label used in the log line, and the parameter string differ.
    # Scenarios run in the order listed. Each entry is (banner, log label, parameters).
    scenarios = (
        # Basic usage without autotune
        (
            'Running cuBLASLt benchmark without autotune...',
            'benchmark',
            '--num_warmup 10 --num_steps 50 --shapes 512,512,512 --in_types fp16 fp32',
        ),
        # Enhanced usage with autotune enabled
        (
            '\nRunning cuBLASLt benchmark with autotune enabled...',
            'benchmark with autotune',
            (
                '--num_warmup 10 --num_steps 50 '
                '--shapes 512,512,512 1024,1024,1024 --in_types fp16 fp32 '
                '--enable_autotune --num_warmup_autotune 20 --num_steps_autotune 50'
            ),
        ),
        # FP8 specific usage with autotune
        (
            '\nRunning cuBLASLt benchmark with FP8 and autotune...',
            'FP8 benchmark with autotune',
            (
                '--num_warmup 5 --num_steps 20 '
                '--shapes 512,512,512 --in_types fp8e4m3 fp8e5m2 '
                '--enable_autotune --num_warmup_autotune 10 --num_steps_autotune 30'
            ),
        ),
    )

    for banner, label, parameters in scenarios:
        print(banner)
        context = BenchmarkRegistry.create_benchmark_context(
            'cublaslt-gemm', platform=Platform.CUDA, parameters=parameters
        )

        benchmark = BenchmarkRegistry.launch_benchmark(context)
        if benchmark:
            logger.info(
                '{}: {}, return code: {}, result: {}'.format(
                    label, benchmark.name, benchmark.return_code, benchmark.result
                )
            )
        else:
            # A falsy return means there is no benchmark object to report on. Say so explicitly
            # instead of skipping silently, so a failed scenario is visible in the log.
            logger.error('{}: no benchmark was launched for parameters: {}'.format(label, parameters))