cublaslt_ncu.log 18.2 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
==PROF== Connected to process 371693 (/opt/superbench/bin/cublaslt_gemm)
2208    2048    5608    0       358.154755      141.598150
==PROF== Disconnected from process 371693
"ID","Process ID","Process Name","Host Name","Kernel Name","Context","Stream","Block Size","Grid Size","Device","CC","Section Name","Metric Name","Metric Unit","Metric Value","Rule Name","Rule Type","Rule Description","Estimated Speedup Type","Estimated Speedup"
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU Speed Of Light Throughput","DRAM Frequency","hz","3995313115.40",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU Speed Of Light Throughput","SM Frequency","hz","1239496049.13",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU Speed Of Light Throughput","Elapsed Cycles","cycle","508757",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU Speed Of Light Throughput","Memory Throughput","%","21.68",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU Speed Of Light Throughput","DRAM Throughput","%","0.74",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU Speed Of Light Throughput","Duration","ns","406240",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU Speed Of Light Throughput","L1/TEX Cache Throughput","%","23.23",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU Speed Of Light Throughput","L2 Cache Throughput","%","15.12",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU Speed Of Light Throughput","SM Active Cycles","cycle","462061.91",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU Speed Of Light Throughput","Compute (SM) Throughput","%","23.59",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","SpeedOfLight","","","","SOLBottleneck","OPT","This workload exhibits low compute throughput and memory bandwidth utilization relative to the peak performance of this device. Achieved compute throughput and/or memory bandwidth below 60.0% of peak typically indicate latency issues. Look at Scheduler Statistics and Warp State Statistics for potential reasons.","",""
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Block Size","","384",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Cluster Scheduling Policy","","PolicySpread",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Cluster Size","","0",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Function Cache Configuration","","CachePreferNone",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Grid Size","","288",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Registers Per Thread","register/thread","168",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Shared Memory Configuration Size","byte","233472",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Driver Shared Memory Per Block","byte/block","1024",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Dynamic Shared Memory Per Block","byte/block","205696",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Static Shared Memory Per Block","byte/block","0",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","# SMs","SM","152",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Stack Size","","1760",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Threads","thread","110592",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","# TPCs","","76",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Enabled TPC IDs","","all",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Uses Green Context","","0",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Launch Statistics","Waves Per SM","","1.89",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","LaunchStats","","","","LaunchConfiguration","OPT","If you execute __syncthreads() to synchronize the threads of a block, it is recommended to have at least two blocks per multiprocessor (compared to the currently executed 1.9 blocks) This way, blocks that aren't waiting for __syncthreads() can keep the hardware busy.","",""
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","LaunchStats","","","","LaunchConfiguration","OPT","A wave of thread blocks is defined as the maximum number of blocks that can be executed in parallel on the target GPU. The number of blocks in a wave depends on the number of multiprocessors and the theoretical occupancy of the kernel. This kernel launch results in 1 full waves and a partial wave of 136 thread blocks. Under the assumption of a uniform execution duration of all thread blocks, this partial wave may account for up to 50.0% of the total runtime of this kernel. Try launching a grid with no partial wave. The overall impact of this tail effect also lessens with the number of full waves executed for a grid. See the Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model) description for more details on launch configurations.","global","50"
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Max Active Clusters","cluster","0",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Max Cluster Size","block","8",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Overall GPU Occupancy","%","0",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Cluster Occupancy","%","0",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Block Limit Barriers","block","9",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Block Limit SM","block","32",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Block Limit Registers","block","1",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Block Limit Shared Mem","block","1",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Block Limit Warps","block","5",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Theoretical Active Warps per SM","warp","12",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Theoretical Occupancy","%","18.75",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Achieved Occupancy","%","15.53",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","Achieved Active Warps Per SM","warp","9.94",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","Occupancy","","","","TheoreticalOccupancy","OPT","The 3.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the hardware maximum of 16. This kernel's theoretical occupancy (18.8%) is limited by the number of required registers, and the required amount of shared memory.","local","81.25"
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU and Memory Workload Distribution","Average DRAM Active Cycles","cycle","12018",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU and Memory Workload Distribution","Total DRAM Elapsed Cycles","cycle","103875584",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU and Memory Workload Distribution","Average L1 Active Cycles","cycle","462061.91",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU and Memory Workload Distribution","Total L1 Elapsed Cycles","cycle","75238502",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU and Memory Workload Distribution","Average L2 Active Cycles","cycle","791529.25",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU and Memory Workload Distribution","Total L2 Elapsed Cycles","cycle","127183034",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU and Memory Workload Distribution","Average SM Active Cycles","cycle","462061.91",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU and Memory Workload Distribution","Total SM Elapsed Cycles","cycle","75238502",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU and Memory Workload Distribution","Average SMSP Active Cycles","cycle","461606.38",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","GPU and Memory Workload Distribution","Total SMSP Elapsed Cycles","cycle","300954008",
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","WorkloadDistribution","","","","WorkloadImbalance","OPT","One or more SMs have a much lower number of active cycles than the average number of active cycles. Maximum instance value is 6.23% above the average, while the minimum instance value is 40.85% below the average.","global","5.818"
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","WorkloadDistribution","","","","WorkloadImbalance","OPT","One or more SMSPs have a much lower number of active cycles than the average number of active cycles. Maximum instance value is 6.69% above the average, while the minimum instance value is 41.54% below the average.","global","6.24"
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","WorkloadDistribution","","","","WorkloadImbalance","OPT","One or more L1 Slices have a much lower number of active cycles than the average number of active cycles. Maximum instance value is 6.23% above the average, while the minimum instance value is 40.85% below the average.","global","5.818"
"0","371693","cublaslt_gemm","127.0.0.1","cutlass3x_sm100_tensorop_s64x256x32gemm_f8_f8_f32_f16_f16_64x256x128_1x1x1_0_tnn_align4_1sm_bias_f16_relu_aux_scalemax","1","7","(384, 1, 1)","(9, 32, 1)","0","10.0","WorkloadDistribution","","","","WorkloadImbalance","OPT","One or more L2 Slices have a much lower number of active cycles than the average number of active cycles. Maximum instance value is 12.44% above the average, while the minimum instance value is 19.49% below the average.","global","14.55"