"git@developer.sourcefind.cn:yangql/googletest.git" did not exist on "3549237957a111154abe3594966395c3aeee4a37"
Unverified commit f6e65a98, authored by Yuting Jiang, committed by GitHub

Benchmarks: Micro benchmark - add ncu profile support in cublaslt-gemm (#740)

**Description**
This PR adds NCU (NVIDIA Nsight Compute) profiling support to the
cublaslt-gemm micro benchmark, enabling detailed kernel analysis
including DRAM throughput, compute throughput, and launch arguments.

**Major Revision**
- Adds --enable_ncu_profiling and --profiling_metrics arguments to control NCU profiling
- Modifies command execution to wrap the benchmark binary with ncu when profiling is enabled
- Updates result parsing to handle both standard and NCU-profiled output formats (see the sketch below)
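For a rough idea of what the wrapped command looks like, here is a minimal sketch that mirrors the command construction in this change; the binary path matches the process shown in the sample log at the end of this PR, while the GEMM shape and iteration counts are placeholder values:

```python
# Sketch of the NCU-wrapped benchmark command (shape and iteration counts are
# placeholders; the binary path follows the sample log in this PR).
bin_path = '/opt/superbench/bin/cublaslt_gemm'
m, n, k, b, in_type = 2208, 2048, 5608, 0, 'fp8e4m3'
num_warmup, num_steps = 20, 50

command = f'{bin_path} -m {m} -n {n} -k {k} -b {b} -w {num_warmup} -i {num_steps} -t {in_type}'

enable_ncu_profiling = True
if enable_ncu_profiling:
    # Skip the warmup launches so ncu profiles a single steady-state kernel launch.
    skip_num = num_warmup - 1 if num_warmup > 1 else 0
    command = f'ncu --set full --launch-skip {skip_num} --launch-count 1 --csv ' + command

print(command)
# ncu --set full --launch-skip 19 --launch-count 1 --csv /opt/superbench/bin/cublaslt_gemm \
#   -m 2208 -n 2048 -k 5608 -b 0 -w 20 -i 50 -t fp8e4m3
```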
parent fe234262
...@@ -56,6 +56,20 @@ def add_parser_arguments(self):
    required=False,
    help='Number of steps to measure for autotune.',
)
self._parser.add_argument(
'--enable_ncu_profiling',
action='store_true',
required=False,
help='Enable ncu profiling for each run.',
)
self._parser.add_argument(
'--profiling_metrics',
type=str,
nargs='+',
default=None,
required=False,
help='List of ncu profiling metrics; all ncu metrics are supported.',
)

def _preprocess(self):
    """Preprocess/preparation operations before the benchmarking.
...@@ -75,16 +89,17 @@ def _preprocess(self):
    f' -a -W {self._args.num_warmup_autotune}'
    f' -I {self._args.num_steps_autotune}'
) if self._args.enable_autotune else ''
command = f'{self.__bin_path} -m {_m} -n {_n} -k {_k} -b {_b} ' + \
    f'-w {self._args.num_warmup} -i {self._args.num_steps} -t {_in_type}' + \
    f'{(" " + autotune_args) if autotune_args else ""}'
if self._args.enable_ncu_profiling:
    skip_num = self._args.num_warmup - 1 if self._args.num_warmup > 1 else 0
    command = f'ncu --set full --launch-skip {skip_num} --launch-count 1 --csv ' + command
self._commands.append(command)

return True

def _process_raw_result(self, cmd_idx, raw_output):    # noqa: C901
    """Function to parse raw results and save the summarized results.

    self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
...@@ -99,12 +114,52 @@ def _process_raw_result(self, cmd_idx, raw_output):
self._result.add_raw_data(f'raw_output_{cmd_idx}', raw_output, self._args.log_raw_data)
try:
if not self._args.enable_ncu_profiling:
    fields = raw_output.strip().split()
    if len(fields) != 6 or not all(x.isdigit() for x in fields[:4]):
        raise ValueError('Invalid result.')
    self._result.add_result(
        f'{self._commands[cmd_idx].split()[-1]}_{fields[3]}_{"_".join(fields[:3])}_flops',
        float(fields[-1])
    )
else:
lines = raw_output.strip().split('\n')
# find the index of the CSV header line (the one containing "Metric Name")
start_idx = next(i for i, line in enumerate(lines) if 'Metric Name' in line)
if start_idx == 0 or start_idx == len(lines) - 1:
raise ValueError('Invalid result.')
result_lines = lines[0:start_idx - 1]
result = False
size = ''
for line in result_lines:
fields = line.strip().split()
if len(fields) == 6 and all(x.isdigit() for x in fields[:4]):
result = True
size = f'{fields[3]}_{"_".join(fields[:3])}'
self._result.add_result(
f'{self._commands[cmd_idx].split()[-1]}_{fields[3]}_{"_".join(fields[:3])}_flops',
float(fields[-1])
)
if not result:
raise ValueError('Invalid result.')
metric_name_index = lines[start_idx].strip().split(',').index('"Metric Name"')
metric_value_index = lines[start_idx].strip().split(',').index('"Metric Value"')
if metric_name_index < 0 or metric_value_index < 0:
raise ValueError('Can not find Metric Name and Value.')
for line in lines[start_idx + 1:]:
fields = line.strip().split('","')
metric_name = fields[metric_name_index].strip('"').replace(' ', '_')
if len(fields) < 15:
continue
if not self._args.profiling_metrics or metric_name in self._args.profiling_metrics:
value = fields[metric_value_index].strip(',').strip('"')
try:
float_value = float(value)
self._result.add_result(
f'{self._commands[cmd_idx].split()[-1]}_{size}_{metric_name}', float_value
)
except ValueError:
pass
except BaseException as e:
    self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
    logger.error(
...
...@@ -6,6 +6,7 @@
import unittest
from types import GeneratorType, SimpleNamespace

from tests.helper import decorator
from tests.helper.testcase import BenchmarkTestCase
from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
from superbench.benchmarks.result import BenchmarkResult
...@@ -83,11 +84,14 @@ def cmd(t, b, m, n, k):
for _m in [32, 128]:
    self.assertIn(cmd(_t, _b, _m, 128, 128), benchmark._commands)

@decorator.load_data('tests/data/cublaslt_ncu.log')
def test_cublaslt_gemm_result_parsing(self, raw_output):
    """Test cublaslt-gemm benchmark result parsing."""
    benchmark = self.get_benchmark()
    self.assertTrue(benchmark._preprocess())
    benchmark._args = SimpleNamespace(
        shapes=['16,16,16', '32,64,128'], in_types=['fp8e4m3'], log_raw_data=False, enable_ncu_profiling=False
    )
    benchmark._result = BenchmarkResult(self.benchmark_name, BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=1)

    # Positive case - valid raw output
...@@ -101,3 +105,15 @@ def test_cublaslt_gemm_result_parsing(self):
    # Negative case - invalid raw output
    self.assertFalse(benchmark._process_raw_result(1, 'cuBLAS API failed'))
# Positive case - valid ncu raw output
benchmark._args = SimpleNamespace(
shapes=['2208,2048,5608'],
in_types=['fp8e4m3'],
log_raw_data=False,
enable_ncu_profiling=True,
profiling_metrics=['DRAM_Throughput'],
)
benchmark._result = BenchmarkResult(self.benchmark_name, BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=1)
self.assertTrue(benchmark._process_raw_result(1, raw_output))
self.assertEqual(0.74, benchmark.result['fp8e4m3_0_2208_2048_5608_DRAM_Throughput'][0])
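The lines below are the contents of the new NCU fixture loaded by the test above (presumably tests/data/cublaslt_ncu.log). As a self-contained illustration of how its CSV section maps metric names to values, here is a simplified sketch of the parsing logic, not the exact committed implementation; it assumes that fixture path is readable:

```python
# Simplified sketch of the NCU CSV parsing path: locate the header row, then map
# "Metric Name" -> "Metric Value" for numeric metrics. Assumes the fixture path below.
with open('tests/data/cublaslt_ncu.log') as f:
    lines = f.read().strip().split('\n')

header_idx = next(i for i, line in enumerate(lines) if 'Metric Name' in line)
header = lines[header_idx].strip().split(',')
name_col = header.index('"Metric Name"')
value_col = header.index('"Metric Value"')

metrics = {}
for line in lines[header_idx + 1:]:
    fields = line.strip().split('","')
    if len(fields) < 15:      # skip rows without a full metric record
        continue
    name = fields[name_col].strip('"').replace(' ', '_')
    try:
        metrics[name] = float(fields[value_col].strip(',').strip('"'))
    except ValueError:
        pass                  # non-numeric values (e.g. 'all', 'PolicySpread') are ignored

print(metrics.get('DRAM_Throughput'))    # expected: 0.74 for this fixture
```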
==PROF== Connected to process 371693 (/opt/superbench/bin/cublaslt_gemm)
2208 2048 5608 0 358.154755 141.598150
==PROF== Disconnected from process 371693
"ID","Process ID","Process Name","Host Name","Kernel Name","Context","Stream","Block Size","Grid Size","Device","CC","Section Name","Metric Name","Metric Unit","Metric Value","Rule Name","Rule Type","Rule Description","Estimated Speedup Type","Estimated Speedup"
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU Speed Of Light Throughput","DRAM Frequency","hz","3995313115.40",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU Speed Of Light Throughput","SM Frequency","hz","1239496049.13",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU Speed Of Light Throughput","Elapsed Cycles","cycle","508757",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU Speed Of Light Throughput","Memory Throughput","%","21.68",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU Speed Of Light Throughput","DRAM Throughput","%","0.74",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU Speed Of Light Throughput","Duration","ns","406240",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU Speed Of Light Throughput","L1/TEX Cache Throughput","%","23.23",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU Speed Of Light Throughput","L2 Cache Throughput","%","15.12",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU Speed Of Light Throughput","SM Active Cycles","cycle","462061.91",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU Speed Of Light Throughput","Compute (SM) Throughput","%","23.59",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","SpeedOfLight","","","","SOLBottleneck","OPT","This workload exhibits low compute throughput and memory bandwidth utilization relative to the peak performance of this device. Achieved compute throughput and/or memory bandwidth below 60.0% of peak typically indicate latency issues. Look at Scheduler Statistics and Warp State Statistics for potential reasons.","",""
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Block Size","","384",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Cluster Scheduling Policy","","PolicySpread",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Cluster Size","","0",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Function Cache Configuration","","CachePreferNone",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Grid Size","","288",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Registers Per Thread","register/thread","168",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Shared Memory Configuration Size","byte","233472",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Driver Shared Memory Per Block","byte/block","1024",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Dynamic Shared Memory Per Block","byte/block","205696",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Static Shared Memory Per Block","byte/block","0",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","# SMs","SM","152",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Stack Size","","1760",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Threads","thread","110592",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","# TPCs","","76",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Enabled TPC IDs","","all",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Uses Green Context","","0",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Waves Per SM","","1.89",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","LaunchStats","","","","LaunchConfiguration","OPT","If you execute __syncthreads() to synchronize the threads of a block, it is recommended to have at least two blocks per multiprocessor (compared to the currently executed 1.9 blocks) This way, blocks that aren't waiting for __syncthreads() can keep the hardware busy.","",""
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","LaunchStats","","","","LaunchConfiguration","OPT","A wave of thread blocks is defined as the maximum number of blocks that can be executed in parallel on the target GPU. The number of blocks in a wave depends on the number of multiprocessors and the theoretical occupancy of the kernel. This kernel launch results in 1 full waves and a partial wave of 136 thread blocks. Under the assumption of a uniform execution duration of all thread blocks, this partial wave may account for up to 50.0% of the total runtime of this kernel. Try launching a grid with no partial wave. The overall impact of this tail effect also lessens with the number of full waves executed for a grid. See the Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model) description for more details on launch configurations.","global","50"
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Max Active Clusters","cluster","0",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Max Cluster Size","block","8",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Overall GPU Occupancy","%","0",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Cluster Occupancy","%","0",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Block Limit Barriers","block","9",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Block Limit SM","block","32",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Block Limit Registers","block","1",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Block Limit Shared Mem","block","1",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Block Limit Warps","block","5",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Theoretical Active Warps per SM","warp","12",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Theoretical Occupancy","%","18.75",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Achieved Occupancy","%","15.53",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Achieved Active Warps Per SM","warp","9.94",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","","","","TheoreticalOccupancy","OPT","The 3.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the hardware maximum of 16. This kernel's theoretical occupancy (18.8%) is limited by the number of required registers, and the required amount of shared memory.","local","81.25"
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU and Memory Workload Distribution","Average DRAM Active Cycles","cycle","12018",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU and Memory Workload Distribution","Total DRAM Elapsed Cycles","cycle","103875584",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU and Memory Workload Distribution","Average L1 Active Cycles","cycle","462061.91",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU and Memory Workload Distribution","Total L1 Elapsed Cycles","cycle","75238502",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU and Memory Workload Distribution","Average L2 Active Cycles","cycle","791529.25",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU and Memory Workload Distribution","Total L2 Elapsed Cycles","cycle","127183034",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU and Memory Workload Distribution","Average SM Active Cycles","cycle","462061.91",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU and Memory Workload Distribution","Total SM Elapsed Cycles","cycle","75238502",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU and Memory Workload Distribution","Average SMSP Active Cycles","cycle","461606.38",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU and Memory Workload Distribution","Total SMSP Elapsed Cycles","cycle","300954008",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","WorkloadDistribution","","","","WorkloadImbalance","OPT","One or more SMs have a much lower number of active cycles than the average number of active cycles. Maximum instance value is 6.23% above the average, while the minimum instance value is 40.85% below the average.","global","5.818"
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","WorkloadDistribution","","","","WorkloadImbalance","OPT","One or more SMSPs have a much lower number of active cycles than the average number of active cycles. Maximum instance value is 6.69% above the average, while the minimum instance value is 41.54% below the average.","global","6.24"
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","WorkloadDistribution","","","","WorkloadImbalance","OPT","One or more L1 Slices have a much lower number of active cycles than the average number of active cycles. Maximum instance value is 6.23% above the average, while the minimum instance value is 40.85% below the average.","global","5.818"
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","WorkloadDistribution","","","","WorkloadImbalance","OPT","One or more L2 Slices have a much lower number of active cycles than the average number of active cycles. Maximum instance value is 12.44% above the average, while the minimum instance value is 19.49% below the average.","global","14.55"