# __init__.py

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""A module containing all the micro-benchmarks."""

from superbench.benchmarks.micro_benchmarks.micro_base import MicroBenchmark, MicroBenchmarkWithInvoke
from superbench.benchmarks.micro_benchmarks.gemm_flops_performance_base import GemmFlopsBenchmark
from superbench.benchmarks.micro_benchmarks.memory_bw_performance_base import MemBwBenchmark
from superbench.benchmarks.micro_benchmarks.gpu_hpcg_performance_base import GpuHpcgBenchmark

from superbench.benchmarks.micro_benchmarks.computation_communication_overlap import ComputationCommunicationOverlap
from superbench.benchmarks.micro_benchmarks.cublas_function import CublasBenchmark
from superbench.benchmarks.micro_benchmarks.blaslt_function_base import BlasLtBaseBenchmark
from superbench.benchmarks.micro_benchmarks.cublaslt_function import CublasLtBenchmark
from superbench.benchmarks.micro_benchmarks.rocm_hipblaslt_function import RocmHipBlasLtBenchmark
from superbench.benchmarks.micro_benchmarks.dtk_hipblaslt_function import DtkHipBlasLtBenchmark
from superbench.benchmarks.micro_benchmarks.dtk_gemm_flops_performance import DtkGemmFlopsBenchmark
from superbench.benchmarks.micro_benchmarks.dtk_hpcg_performance import DtkHpcgBenchmark
from superbench.benchmarks.micro_benchmarks.cuda_gemm_flops_performance import CudaGemmFlopsBenchmark
from superbench.benchmarks.micro_benchmarks.cuda_memory_bw_performance import CudaMemBwBenchmark
from superbench.benchmarks.micro_benchmarks.cuda_nccl_bw_performance import CudaNcclBwBenchmark
from superbench.benchmarks.micro_benchmarks.cudnn_function import CudnnBenchmark
from superbench.benchmarks.micro_benchmarks.disk_performance import DiskBenchmark
from superbench.benchmarks.micro_benchmarks.dist_inference import DistInference
from superbench.benchmarks.micro_benchmarks.cpu_memory_bw_latency_performance import CpuMemBwLatencyBenchmark
from superbench.benchmarks.micro_benchmarks.cpu_stream_performance import CpuStreamBenchmark
from superbench.benchmarks.micro_benchmarks.cpu_hpl_performance import CpuHplBenchmark
from superbench.benchmarks.micro_benchmarks.gpcnet_performance import GPCNetBenchmark
from superbench.benchmarks.micro_benchmarks.gpu_copy_bw_performance import GpuCopyBwBenchmark
from superbench.benchmarks.micro_benchmarks.gpu_stream import GpuStreamBenchmark
from superbench.benchmarks.micro_benchmarks.gpu_burn_test import GpuBurnBenchmark
from superbench.benchmarks.micro_benchmarks.ib_loopback_performance import IBLoopbackBenchmark
from superbench.benchmarks.micro_benchmarks.ib_validation_performance import IBBenchmark
from superbench.benchmarks.micro_benchmarks.kernel_launch_overhead import KernelLaunch
from superbench.benchmarks.micro_benchmarks.ort_inference_performance import ORTInferenceBenchmark
from superbench.benchmarks.micro_benchmarks.rocm_gemm_flops_performance import RocmGemmFlopsBenchmark
from superbench.benchmarks.micro_benchmarks.rocm_memory_bw_performance import RocmMemBwBenchmark
from superbench.benchmarks.micro_benchmarks.sharding_matmul import ShardingMatmul
from superbench.benchmarks.micro_benchmarks.tcp_connectivity import TCPConnectivityBenchmark
from superbench.benchmarks.micro_benchmarks.tensorrt_inference_performance import TensorRTInferenceBenchmark
from superbench.benchmarks.micro_benchmarks.directx_gpu_encoding_latency import DirectXGPUEncodingLatency
from superbench.benchmarks.micro_benchmarks.directx_gpu_copy_performance import DirectXGPUCopyBw
from superbench.benchmarks.micro_benchmarks.directx_mem_bw_performance import DirectXGPUMemBw
from superbench.benchmarks.micro_benchmarks.directx_gemm_flops_performance import DirectXGPUCoreFlops
from superbench.benchmarks.micro_benchmarks.nvbandwidth import NvBandwidthBenchmark

# Names exported by a wildcard import of this package. Each entry matches one
# of the classes imported at the top of this file; the original order is kept.
__all__ = [
    'BlasLtBaseBenchmark', 'ComputationCommunicationOverlap',
    'CpuMemBwLatencyBenchmark', 'CpuHplBenchmark', 'CpuStreamBenchmark',
    'CublasBenchmark', 'CublasLtBenchmark',
    'CudaGemmFlopsBenchmark', 'CudaMemBwBenchmark', 'CudaNcclBwBenchmark', 'CudnnBenchmark',
    'DiskBenchmark', 'DistInference',
    'DtkGemmFlopsBenchmark', 'RocmHipBlasLtBenchmark', 'DtkHipBlasLtBenchmark',
    'GPCNetBenchmark', 'GemmFlopsBenchmark',
    'GpuBurnBenchmark', 'GpuCopyBwBenchmark', 'GpuHpcgBenchmark', 'GpuStreamBenchmark',
    'IBBenchmark', 'IBLoopbackBenchmark',
    'KernelLaunch', 'MemBwBenchmark',
    'MicroBenchmark', 'MicroBenchmarkWithInvoke',
    'ORTInferenceBenchmark',
    'RocmGemmFlopsBenchmark', 'RocmMemBwBenchmark',
    'ShardingMatmul', 'TCPConnectivityBenchmark', 'TensorRTInferenceBenchmark',
    'DirectXGPUEncodingLatency', 'DirectXGPUCopyBw', 'DirectXGPUMemBw', 'DirectXGPUCoreFlops',
    'DtkHpcgBenchmark', 'NvBandwidthBenchmark',
]