Unverified Commit f6e65a98 authored by Yuting Jiang's avatar Yuting Jiang Committed by GitHub
Browse files

Benchmarks: Micro benchmark - add ncu profile support in cublaslt-gemm (#740)

**Description**
This PR adds NCU (NVIDIA Nsight Compute) profiling support to the
cublaslt-gemm micro benchmark, enabling detailed kernel analysis
including DRAM throughput, compute throughput, and launch arguments.

**Major Revision**
- Add --enable_ncu_profiling and --profiling_metrics arguments for ncu profiling
- Modify command execution to use NCU when profiling is enabled
- Update result parsing to handle both standard and NCU profiled output
formats
parent fe234262
......@@ -56,6 +56,20 @@ def add_parser_arguments(self):
required=False,
help='Number of steps to measure for autotune.',
)
self._parser.add_argument(
'--enable_ncu_profiling',
action='store_true',
required=False,
help='Enable ncu profiling for each run.',
)
self._parser.add_argument(
'--profiling_metrics',
type=str,
nargs='+',
default=None,
required=False,
help='List of ncu profiling metrics, support all ncu metrics.',
)
def _preprocess(self):
"""Preprocess/preparation operations before the benchmarking.
......@@ -75,16 +89,17 @@ def _preprocess(self):
f' -a -W {self._args.num_warmup_autotune}'
f' -I {self._args.num_steps_autotune}'
) if self._args.enable_autotune else ''
self._commands.append(
f'{self.__bin_path} -m {_m} -n {_n} -k {_k} -b {_b} '
f'-w {self._args.num_warmup} -i {self._args.num_steps} -t {_in_type}'
command = f'{self.__bin_path} -m {_m} -n {_n} -k {_k} -b {_b} ' + \
f'-w {self._args.num_warmup} -i {self._args.num_steps} -t {_in_type}' + \
f'{(" " + autotune_args) if autotune_args else ""}'
)
if self._args.enable_ncu_profiling:
skip_num = self._args.num_warmup - 1 if self._args.num_warmup > 1 else 0
command = f'ncu --set full --launch-skip {skip_num} --launch-count 1 --csv ' + command
self._commands.append(command)
return True
def _process_raw_result(self, cmd_idx, raw_output):
def _process_raw_result(self, cmd_idx, raw_output): # noqa: C901
"""Function to parse raw results and save the summarized results.
self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
......@@ -99,12 +114,52 @@ def _process_raw_result(self, cmd_idx, raw_output):
self._result.add_raw_data(f'raw_output_{cmd_idx}', raw_output, self._args.log_raw_data)
try:
fields = raw_output.strip().split()
if len(fields) != 6 or not all(x.isdigit() for x in fields[:4]):
raise ValueError('Invalid result.')
self._result.add_result(
f'{self._commands[cmd_idx].split()[-1]}_{fields[3]}_{"_".join(fields[:3])}_flops', float(fields[-1])
)
if not self._args.enable_ncu_profiling:
fields = raw_output.strip().split()
if len(fields) != 6 or not all(x.isdigit() for x in fields[:4]):
raise ValueError('Invalid result.')
self._result.add_result(
f'{self._commands[cmd_idx].split()[-1]}_{fields[3]}_{"_".join(fields[:3])}_flops',
float(fields[-1])
)
else:
lines = raw_output.strip().split('\n')
# find the index of the CSV header line (the one containing "Metric Name")
start_idx = next(i for i, line in enumerate(lines) if 'Metric Name' in line)
if start_idx == 0 or start_idx == len(lines) - 1:
raise ValueError('Invalid result.')
result_lines = lines[0:start_idx - 1]
result = False
size = ''
for line in result_lines:
fields = line.strip().split()
if len(fields) == 6 and all(x.isdigit() for x in fields[:4]):
result = True
size = f'{fields[3]}_{"_".join(fields[:3])}'
self._result.add_result(
f'{self._commands[cmd_idx].split()[-1]}_{fields[3]}_{"_".join(fields[:3])}_flops',
float(fields[-1])
)
if not result:
raise ValueError('Invalid result.')
metric_name_index = lines[start_idx].strip().split(',').index('"Metric Name"')
metric_value_index = lines[start_idx].strip().split(',').index('"Metric Value"')
if metric_name_index < 0 or metric_value_index < 0:
raise ValueError('Can not find Metric Name and Value.')
for line in lines[start_idx + 1:]:
fields = line.strip().split('","')
metric_name = fields[metric_name_index].strip('"').replace(' ', '_')
if len(fields) < 15:
continue
if not self._args.profiling_metrics or metric_name in self._args.profiling_metrics:
value = fields[metric_value_index].strip(',').strip('"')
try:
float_value = float(value)
self._result.add_result(
f'{self._commands[cmd_idx].split()[-1]}_{size}_{metric_name}', float_value
)
except ValueError:
pass
except BaseException as e:
self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
logger.error(
......
......@@ -6,6 +6,7 @@
import unittest
from types import GeneratorType, SimpleNamespace
from tests.helper import decorator
from tests.helper.testcase import BenchmarkTestCase
from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
from superbench.benchmarks.result import BenchmarkResult
......@@ -83,11 +84,14 @@ def cmd(t, b, m, n, k):
for _m in [32, 128]:
self.assertIn(cmd(_t, _b, _m, 128, 128), benchmark._commands)
def test_cublaslt_gemm_result_parsing(self):
@decorator.load_data('tests/data/cublaslt_ncu.log')
def test_cublaslt_gemm_result_parsing(self, raw_output):
"""Test cublaslt-gemm benchmark result parsing."""
benchmark = self.get_benchmark()
self.assertTrue(benchmark._preprocess())
benchmark._args = SimpleNamespace(shapes=['16,16,16', '32,64,128'], in_types=['fp8e4m3'], log_raw_data=False)
benchmark._args = SimpleNamespace(
shapes=['16,16,16', '32,64,128'], in_types=['fp8e4m3'], log_raw_data=False, enable_ncu_profiling=False
)
benchmark._result = BenchmarkResult(self.benchmark_name, BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=1)
# Positive case - valid raw output
......@@ -101,3 +105,15 @@ def test_cublaslt_gemm_result_parsing(self):
# Negative case - invalid raw output
self.assertFalse(benchmark._process_raw_result(1, 'cuBLAS API failed'))
# Positive case - valid ncu raw output
benchmark._args = SimpleNamespace(
shapes=['2208,2048,5608'],
in_types=['fp8e4m3'],
log_raw_data=False,
enable_ncu_profiling=True,
profiling_metrics=['DRAM_Throughput'],
)
benchmark._result = BenchmarkResult(self.benchmark_name, BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=1)
self.assertTrue(benchmark._process_raw_result(1, raw_output))
self.assertEqual(0.74, benchmark.result['fp8e4m3_0_2208_2048_5608_DRAM_Throughput'][0])
==PROF== Connected to process 371693 (/opt/superbench/bin/cublaslt_gemm)
2208 2048 5608 0 358.154755 141.598150
==PROF== Disconnected from process 371693
"ID","Process ID","Process Name","Host Name","Kernel Name","Context","Stream","Block Size","Grid Size","Device","CC","Section Name","Metric Name","Metric Unit","Metric Value","Rule Name","Rule Type","Rule Description","Estimated Speedup Type","Estimated Speedup"
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU Speed Of Light Throughput","DRAM Frequency","hz","3995313115.40",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU Speed Of Light Throughput","SM Frequency","hz","1239496049.13",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU Speed Of Light Throughput","Elapsed Cycles","cycle","508757",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU Speed Of Light Throughput","Memory Throughput","%","21.68",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU Speed Of Light Throughput","DRAM Throughput","%","0.74",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU Speed Of Light Throughput","Duration","ns","406240",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU Speed Of Light Throughput","L1/TEX Cache Throughput","%","23.23",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU Speed Of Light Throughput","L2 Cache Throughput","%","15.12",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU Speed Of Light Throughput","SM Active Cycles","cycle","462061.91",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU Speed Of Light Throughput","Compute (SM) Throughput","%","23.59",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","SpeedOfLight","","","","SOLBottleneck","OPT","This workload exhibits low compute throughput and memory bandwidth utilization relative to the peak performance of this device. Achieved compute throughput and/or memory bandwidth below 60.0% of peak typically indicate latency issues. Look at Scheduler Statistics and Warp State Statistics for potential reasons.","",""
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Block Size","","384",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Cluster Scheduling Policy","","PolicySpread",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Cluster Size","","0",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Function Cache Configuration","","CachePreferNone",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Grid Size","","288",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Registers Per Thread","register/thread","168",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Shared Memory Configuration Size","byte","233472",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Driver Shared Memory Per Block","byte/block","1024",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Dynamic Shared Memory Per Block","byte/block","205696",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Static Shared Memory Per Block","byte/block","0",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","# SMs","SM","152",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Stack Size","","1760",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Threads","thread","110592",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","# TPCs","","76",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Enabled TPC IDs","","all",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Uses Green Context","","0",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Waves Per SM","","1.89",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","LaunchStats","","","","LaunchConfiguration","OPT","If you execute __syncthreads() to synchronize the threads of a block, it is recommended to have at least two blocks per multiprocessor (compared to the currently executed 1.9 blocks) This way, blocks that aren't waiting for __syncthreads() can keep the hardware busy.","",""
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","LaunchStats","","","","LaunchConfiguration","OPT","A wave of thread blocks is defined as the maximum number of blocks that can be executed in parallel on the target GPU. The number of blocks in a wave depends on the number of multiprocessors and the theoretical occupancy of the kernel. This kernel launch results in 1 full waves and a partial wave of 136 thread blocks. Under the assumption of a uniform execution duration of all thread blocks, this partial wave may account for up to 50.0% of the total runtime of this kernel. Try launching a grid with no partial wave. The overall impact of this tail effect also lessens with the number of full waves executed for a grid. See the Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model) description for more details on launch configurations.","global","50"
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Max Active Clusters","cluster","0",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Max Cluster Size","block","8",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Overall GPU Occupancy","%","0",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Cluster Occupancy","%","0",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Block Limit Barriers","block","9",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Block Limit SM","block","32",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Block Limit Registers","block","1",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Block Limit Shared Mem","block","1",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Block Limit Warps","block","5",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Theoretical Active Warps per SM","warp","12",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Theoretical Occupancy","%","18.75",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Achieved Occupancy","%","15.53",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Achieved Active Warps Per SM","warp","9.94",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","","","","TheoreticalOccupancy","OPT","The 3.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the hardware maximum of 16. This kernel's theoretical occupancy (18.8%) is limited by the number of required registers, and the required amount of shared memory.","local","81.25"
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU and Memory Workload Distribution","Average DRAM Active Cycles","cycle","12018",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU and Memory Workload Distribution","Total DRAM Elapsed Cycles","cycle","103875584",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU and Memory Workload Distribution","Average L1 Active Cycles","cycle","462061.91",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU and Memory Workload Distribution","Total L1 Elapsed Cycles","cycle","75238502",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU and Memory Workload Distribution","Average L2 Active Cycles","cycle","791529.25",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU and Memory Workload Distribution","Total L2 Elapsed Cycles","cycle","127183034",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU and Memory Workload Distribution","Average SM Active Cycles","cycle","462061.91",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU and Memory Workload Distribution","Total SM Elapsed Cycles","cycle","75238502",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU and Memory Workload Distribution","Average SMSP Active Cycles","cycle","461606.38",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU and Memory Workload Distribution","Total SMSP Elapsed Cycles","cycle","300954008",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","WorkloadDistribution","","","","WorkloadImbalance","OPT","One or more SMs have a much lower number of active cycles than the average number of active cycles. Maximum instance value is 6.23% above the average, while the minimum instance value is 40.85% below the average.","global","5.818"
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","WorkloadDistribution","","","","WorkloadImbalance","OPT","One or more SMSPs have a much lower number of active cycles than the average number of active cycles. Maximum instance value is 6.69% above the average, while the minimum instance value is 41.54% below the average.","global","6.24"
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","WorkloadDistribution","","","","WorkloadImbalance","OPT","One or more L1 Slices have a much lower number of active cycles than the average number of active cycles. Maximum instance value is 6.23% above the average, while the minimum instance value is 40.85% below the average.","global","5.818"
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","WorkloadDistribution","","","","WorkloadImbalance","OPT","One or more L2 Slices have a much lower number of active cycles than the average number of active cycles. Maximum instance value is 12.44% above the average, while the minimum instance value is 19.49% below the average.","global","14.55"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment