[INFO] args = Namespace(routine='BatchPrefillWithPagedKVCacheWrapper', no_cuda_graph=False, use_cupti=False, refcheck=True, allow_output_mismatch=True, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='Llama-3.1-70B', generate_repro_command=True, repro_command='', backends=['fa2', 'fa3', 'cudnn', 'trtllm-gen'], page_size=16, batch_size=1, s_qo=1024, s_kv=1024, num_qo_heads=64, num_kv_heads=8, head_dim_qk=128, head_dim_vo=128, head_dim_ckv=None, head_dim_kpe=None, q_dtype='bfloat16', kv_dtype='bfloat16', causal=True, random_actual_seq_len=True)
[INFO] Running testBatchPrefillWithPagedKVCacheWrapper
[INFO] FlashInfer version: 0.3.1
[VVERBOSE] gpu_name = 'NVIDIA_B200'
[INFO] To reproduce this test case, run the following command: python3 flashinfer_benchmark.py --routine BatchPrefillWithPagedKVCacheWrapper --backends fa2 fa3 cudnn trtllm-gen --page_size 16 --batch_size 1 --s_qo 1024 --s_kv 1024 --num_qo_heads 64 --num_kv_heads 8 --head_dim_qk 128 --head_dim_vo 128 --random_actual_seq_len -vv --refcheck --causal --q_dtype bfloat16 --kv_dtype bfloat16 --allow_output_mismatch --generate_repro_command --case_tag Llama-3.1-70B
[WARNING] fa3 for routine BatchPrefillWithPagedKVCacheWrapper is not supported on compute capability 10.0. Skipping.
[VVERBOSE] s_qo == s_kv, making actual_seq_lens_kv the same as actual_seq_lens_q
[VERBOSE] Average actual qo seq len: 103
[VERBOSE] Average actual kv seq len: 103
[VVERBOSE] actual_seq_lens_q.flatten() = tensor([103], dtype=torch.int32)
[VVERBOSE] actual_seq_lens_kv.flatten() = tensor([103], dtype=torch.int32)
[VVERBOSE] q.shape = torch.Size([103, 64, 128])
[VVERBOSE] num_pages_per_seq = 64
[VVERBOSE] total_num_pages = 64
[VVERBOSE] kv_cache.shape = torch.Size([64, 2, 8, 16, 128])
[VVERBOSE] kv_cache.stride() = (32768, 16384, 128, 1024, 1)
[VVERBOSE] block_tables.shape = torch.Size([1, 64])
[VVERBOSE] qo_indptr.shape = torch.Size([2])
[VVERBOSE] qo_indptr.dtype = torch.int32
[VVERBOSE] kv_indptr.shape = torch.Size([2])
[VVERBOSE] kv_indices.shape = torch.Size([7])
[VVERBOSE] kv_last_page_len.shape = torch.Size([1])
[VVERBOSE] scale = 0.08838834764831843
[PERF] fa2            :: median time 0.012 ms; std 0.001 ms; achieved tflops 13.964 TFLOPs/sec; achieved tb_per_sec 0.305 TB/sec
[PERF] cudnn          :: median time 0.018 ms; std 0.000 ms; achieved tflops 9.452 TFLOPs/sec; achieved tb_per_sec 0.206 TB/sec
[PERF] trtllm-gen     :: median time 0.008 ms; std 0.000 ms; achieved tflops 20.700 TFLOPs/sec; achieved tb_per_sec 0.452 TB/sec
[INFO] args = Namespace(routine='BatchPrefillWithPagedKVCacheWrapper', no_cuda_graph=False, use_cupti=False, refcheck=True, allow_output_mismatch=True, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='Llama-3.1-70B', generate_repro_command=True, repro_command='', backends=['fa2', 'fa3', 'cudnn', 'trtllm-gen'], page_size=16, batch_size=32, s_qo=1024, s_kv=1024, num_qo_heads=64, num_kv_heads=8, head_dim_qk=128, head_dim_vo=128, head_dim_ckv=None, head_dim_kpe=None, q_dtype='bfloat16', kv_dtype='bfloat16', causal=True, random_actual_seq_len=True)
[INFO] Running testBatchPrefillWithPagedKVCacheWrapper
[INFO] FlashInfer version: 0.3.1
[VVERBOSE] gpu_name = 'NVIDIA_B200'
[INFO] To reproduce this test case, run the following command: python3 flashinfer_benchmark.py --routine BatchPrefillWithPagedKVCacheWrapper --backends fa2 fa3 cudnn trtllm-gen --page_size 16 --batch_size 32 --s_qo 1024 --s_kv 1024 --num_qo_heads 64 --num_kv_heads 8 --head_dim_qk 128 --head_dim_vo 128 --random_actual_seq_len -vv --refcheck --causal --q_dtype bfloat16 --kv_dtype bfloat16 --allow_output_mismatch --generate_repro_command --case_tag Llama-3.1-70B
[WARNING] fa3 for routine BatchPrefillWithPagedKVCacheWrapper is not supported on compute capability 10.0. Skipping.
[VVERBOSE] s_qo == s_kv, making actual_seq_lens_kv the same as actual_seq_lens_q
[VERBOSE] Average actual qo seq len: 399
[VERBOSE] Average actual kv seq len: 399
[VVERBOSE] actual_seq_lens_q.flatten() = tensor([103, 436, 861, 271, 107,  72, 701,  21, 615, 122, 467, 215, 331, 459,
         88, 373, 100, 872, 664, 131, 662, 309, 770, 344, 492, 414, 806, 386,
        192, 956, 277, 161], dtype=torch.int32)
[VVERBOSE] actual_seq_lens_kv.flatten() = tensor([103, 436, 861, 271, 107,  72, 701,  21, 615, 122, 467, 215, 331, 459,
         88, 373, 100, 872, 664, 131, 662, 309, 770, 344, 492, 414, 806, 386,
        192, 956, 277, 161], dtype=torch.int32)
[VVERBOSE] q.shape = torch.Size([12778, 64, 128])
[VVERBOSE] num_pages_per_seq = 64
[VVERBOSE] total_num_pages = 2048
[VVERBOSE] kv_cache.shape = torch.Size([2048, 2, 8, 16, 128])
[VVERBOSE] kv_cache.stride() = (32768, 16384, 128, 1024, 1)
[VVERBOSE] block_tables.shape = torch.Size([32, 64])
[VVERBOSE] qo_indptr.shape = torch.Size([33])
[VVERBOSE] qo_indptr.dtype = torch.int32
[VVERBOSE] kv_indptr.shape = torch.Size([33])
[VVERBOSE] kv_indices.shape = torch.Size([815])
[VVERBOSE] kv_last_page_len.shape = torch.Size([32])
[VVERBOSE] scale = 0.08838834764831843
[PERF] fa2            :: median time 0.483 ms; std 0.003 ms; achieved tflops 250.421 TFLOPs/sec; achieved tb_per_sec 0.975 TB/sec
[PERF] cudnn          :: median time 0.382 ms; std 0.001 ms; achieved tflops 317.089 TFLOPs/sec; achieved tb_per_sec 1.234 TB/sec
[PERF] trtllm-gen     :: median time 0.744 ms; std 0.000 ms; achieved tflops 162.619 TFLOPs/sec; achieved tb_per_sec 0.633 TB/sec
[INFO] args = Namespace(routine='BatchPrefillWithRaggedKVCacheWrapper', no_cuda_graph=False, use_cupti=False, refcheck=True, allow_output_mismatch=True, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='DeepSeek-R1', generate_repro_command=True, repro_command='', backends=['fa2', 'fa3', 'cutlass', 'cudnn'], page_size=0, batch_size=1, s_qo=1024, s_kv=1024, num_qo_heads=128, num_kv_heads=128, head_dim_qk=192, head_dim_vo=128, head_dim_ckv=None, head_dim_kpe=None, q_dtype='bfloat16', kv_dtype='bfloat16', causal=True, random_actual_seq_len=True)
[INFO] Running testBatchPrefillWithRaggedKVCacheWrapper
[INFO] FlashInfer version: 0.3.1
[VVERBOSE] gpu_name = 'NVIDIA_B200'
[INFO] To reproduce this test case, run the following command: python3 flashinfer_benchmark.py --routine BatchPrefillWithRaggedKVCacheWrapper --backends fa2 fa3 cutlass cudnn --batch_size 1 --s_qo 1024 --s_kv 1024 --num_qo_heads 128 --num_kv_heads 128 --head_dim_qk 192 --head_dim_vo 128 --random_actual_seq_len -vv --refcheck --causal --q_dtype bfloat16 --kv_dtype bfloat16 --allow_output_mismatch --generate_repro_command --case_tag DeepSeek-R1
[WARNING] fa3 for routine BatchPrefillWithRaggedKVCacheWrapper is not supported on compute capability 10.0. Skipping.
[VVERBOSE] s_qo == s_kv, making actual_seq_lens_kv the same as actual_seq_lens_q
[VERBOSE] Average actual qo seq len: 103
[VERBOSE] Average actual kv seq len: 103
[VVERBOSE] actual_seq_lens_q.flatten() = tensor([103], dtype=torch.int32)
[VVERBOSE] actual_seq_lens_kv.flatten() = tensor([103], dtype=torch.int32)
[VVERBOSE] q.shape = torch.Size([103, 128, 192])
[VVERBOSE] k.shape = torch.Size([103, 128, 192])
[VVERBOSE] v.shape = torch.Size([103, 128, 128])
[VVERBOSE] qo_indptr.shape = torch.Size([2])
[VVERBOSE] kv_indptr.shape = torch.Size([2])
[VVERBOSE] scale = 0.07216878364870323
[PERF] fa2            :: median time 0.016 ms; std 0.000 ms; achieved tflops 26.943 TFLOPs/sec; achieved tb_per_sec 1.046 TB/sec
[PERF] cutlass        :: median time 0.012 ms; std 0.000 ms; achieved tflops 35.963 TFLOPs/sec; achieved tb_per_sec 1.397 TB/sec
[PERF] cudnn          :: median time 0.019 ms; std 0.000 ms; achieved tflops 23.316 TFLOPs/sec; achieved tb_per_sec 0.905 TB/sec
[INFO] args = Namespace(routine='BatchPrefillWithRaggedKVCacheWrapper', no_cuda_graph=False, use_cupti=False, refcheck=True, allow_output_mismatch=True, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='DeepSeek-R1', generate_repro_command=True, repro_command='', backends=['fa2', 'fa3', 'cutlass', 'cudnn'], page_size=0, batch_size=16, s_qo=1024, s_kv=1024, num_qo_heads=128, num_kv_heads=128, head_dim_qk=192, head_dim_vo=128, head_dim_ckv=None, head_dim_kpe=None, q_dtype='bfloat16', kv_dtype='bfloat16', causal=True, random_actual_seq_len=True)
[INFO] Running testBatchPrefillWithRaggedKVCacheWrapper
[INFO] FlashInfer version: 0.3.1
[VVERBOSE] gpu_name = 'NVIDIA_B200'
[INFO] To reproduce this test case, run the following command: python3 flashinfer_benchmark.py --routine BatchPrefillWithRaggedKVCacheWrapper --backends fa2 fa3 cutlass cudnn --batch_size 16 --s_qo 1024 --s_kv 1024 --num_qo_heads 128 --num_kv_heads 128 --head_dim_qk 192 --head_dim_vo 128 --random_actual_seq_len -vv --refcheck --causal --q_dtype bfloat16 --kv_dtype bfloat16 --allow_output_mismatch --generate_repro_command --case_tag DeepSeek-R1
[WARNING] fa3 for routine BatchPrefillWithRaggedKVCacheWrapper is not supported on compute capability 10.0. Skipping.
[VVERBOSE] s_qo == s_kv, making actual_seq_lens_kv the same as actual_seq_lens_q
[VERBOSE] Average actual qo seq len: 327
[VERBOSE] Average actual kv seq len: 327
[VVERBOSE] actual_seq_lens_q.flatten() = tensor([103, 436, 861, 271, 107,  72, 701,  21, 615, 122, 467, 215, 331, 459,
         88, 373], dtype=torch.int32)
[VVERBOSE] actual_seq_lens_kv.flatten() = tensor([103, 436, 861, 271, 107,  72, 701,  21, 615, 122, 467, 215, 331, 459,
         88, 373], dtype=torch.int32)
[VVERBOSE] q.shape = torch.Size([5242, 128, 192])
[VVERBOSE] k.shape = torch.Size([5242, 128, 192])
[VVERBOSE] v.shape = torch.Size([5242, 128, 128])
[VVERBOSE] qo_indptr.shape = torch.Size([17])
[VVERBOSE] kv_indptr.shape = torch.Size([17])
[VVERBOSE] scale = 0.07216878364870323
[PERF] fa2            :: median time 0.498 ms; std 0.005 ms; achieved tflops 217.968 TFLOPs/sec; achieved tb_per_sec 1.726 TB/sec
[PERF] cutlass        :: median time 0.533 ms; std 0.001 ms; achieved tflops 203.573 TFLOPs/sec; achieved tb_per_sec 1.612 TB/sec
[PERF] cudnn          :: median time 0.312 ms; std 0.001 ms; achieved tflops 347.342 TFLOPs/sec; achieved tb_per_sec 2.750 TB/sec
[INFO] args = Namespace(routine='BatchDecodeWithPagedKVCacheWrapper', no_cuda_graph=False, use_cupti=False, refcheck=True, allow_output_mismatch=True, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='Llama-3.1-70B', generate_repro_command=True, repro_command='', backends=['fa2', 'fa2_tc', 'cudnn', 'trtllm-gen', 'trtllm-gen-native'], page_size=16, batch_size=1, s_qo=1, s_kv=1024, num_qo_heads=64, num_kv_heads=8, head_dim_qk=128, head_dim_vo=128, head_dim_ckv=None, head_dim_kpe=None, q_dtype='bfloat16', kv_dtype='bfloat16', causal=False, random_actual_seq_len=True)
[INFO] Running testBatchDecodeWithPagedKVCacheWrapper
[INFO] FlashInfer version: 0.3.1
[VVERBOSE] gpu_name = 'NVIDIA_B200'
[INFO] To reproduce this test case, run the following command: python3 flashinfer_benchmark.py --routine BatchDecodeWithPagedKVCacheWrapper --backends fa2 fa2_tc cudnn trtllm-gen trtllm-gen-native --page_size 16 --batch_size 1 --s_qo 1 --s_kv 1024 --num_qo_heads 64 --num_kv_heads 8 --head_dim_qk 128 --head_dim_vo 128 --random_actual_seq_len -vv --refcheck --q_dtype bfloat16 --kv_dtype bfloat16 --allow_output_mismatch --generate_repro_command --case_tag Llama-3.1-70B
[VERBOSE] Average actual seq len: 84
[VVERBOSE] actual_seq_lens_kv.flatten() = tensor([84], device='cuda:0', dtype=torch.int32)
[VVERBOSE] q.shape = torch.Size([1, 64, 128])
[VVERBOSE] num_pages_per_seq = 64
[VVERBOSE] total_num_pages = 64
[VVERBOSE] kv_cache.shape = torch.Size([64, 2, 8, 16, 128])
[VVERBOSE] kv_cache.stride() = (32768, 16384, 128, 1024, 1)
[VVERBOSE] block_tables.shape = torch.Size([1, 64])
[VVERBOSE] kv_indptr.shape = torch.Size([2])
[VVERBOSE] kv_indices.shape = torch.Size([6])
[VVERBOSE] kv_last_page_len.shape = torch.Size([1])
[VVERBOSE] scale = 0.08838834764831843
[ERROR] Output tensor mismatch between backends fa2 and cudnn: 5063 / 8192 (61.80%) elements are different
[PERF] fa2            :: median time 0.035 ms; std 0.000 ms; achieved tflops 0.079 TFLOPs/sec; achieved tb_per_sec 0.011 TB/sec
[PERF] fa2_tc         :: median time 0.010 ms; std 0.000 ms; achieved tflops 0.263 TFLOPs/sec; achieved tb_per_sec 0.036 TB/sec
[PERF] cudnn          :: median time 0.011 ms; std 0.000 ms; achieved tflops 0.258 TFLOPs/sec; achieved tb_per_sec 0.035 TB/sec
[PERF] trtllm-gen     :: median time 0.006 ms; std 0.000 ms; achieved tflops 0.480 TFLOPs/sec; achieved tb_per_sec 0.066 TB/sec
[PERF] trtllm-gen-nati:: median time 0.006 ms; std 0.000 ms; achieved tflops 0.445 TFLOPs/sec; achieved tb_per_sec 0.061 TB/sec
[INFO] args = Namespace(routine='BatchDecodeWithPagedKVCacheWrapper', no_cuda_graph=False, use_cupti=False, refcheck=True, allow_output_mismatch=True, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='Llama-3.1-70B', generate_repro_command=True, repro_command='', backends=['fa2', 'fa2_tc', 'cudnn', 'trtllm-gen', 'trtllm-gen-native'], page_size=16, batch_size=16, s_qo=1, s_kv=1024, num_qo_heads=64, num_kv_heads=8, head_dim_qk=128, head_dim_vo=128, head_dim_ckv=None, head_dim_kpe=None, q_dtype='bfloat16', kv_dtype='bfloat16', causal=False, random_actual_seq_len=True)
[INFO] Running testBatchDecodeWithPagedKVCacheWrapper
[INFO] FlashInfer version: 0.3.1
[VVERBOSE] gpu_name = 'NVIDIA_B200'
[INFO] To reproduce this test case, run the following command: python3 flashinfer_benchmark.py --routine BatchDecodeWithPagedKVCacheWrapper --backends fa2 fa2_tc cudnn trtllm-gen trtllm-gen-native --page_size 16 --batch_size 16 --s_qo 1 --s_kv 1024 --num_qo_heads 64 --num_kv_heads 8 --head_dim_qk 128 --head_dim_vo 128 --random_actual_seq_len -vv --refcheck --q_dtype bfloat16 --kv_dtype bfloat16 --allow_output_mismatch --generate_repro_command --case_tag Llama-3.1-70B
[VERBOSE] Average actual seq len: 501
[VVERBOSE] actual_seq_lens_kv.flatten() = tensor([ 84, 874, 167, 691, 274, 736,  63, 813, 781, 450, 794, 226, 510, 499,
        524, 541], device='cuda:0', dtype=torch.int32)
[VVERBOSE] q.shape = torch.Size([16, 64, 128])
[VVERBOSE] num_pages_per_seq = 64
[VVERBOSE] total_num_pages = 1024
[VVERBOSE] kv_cache.shape = torch.Size([1024, 2, 8, 16, 128])
[VVERBOSE] kv_cache.stride() = (32768, 16384, 128, 1024, 1)
[VVERBOSE] block_tables.shape = torch.Size([16, 64])
[VVERBOSE] kv_indptr.shape = torch.Size([17])
[VVERBOSE] kv_indices.shape = torch.Size([509])
[VVERBOSE] kv_last_page_len.shape = torch.Size([16])
[VVERBOSE] scale = 0.08838834764831843
[PERF] fa2            :: median time 0.055 ms; std 0.000 ms; achieved tflops 4.757 TFLOPs/sec; achieved tb_per_sec 0.604 TB/sec
[PERF] fa2_tc         :: median time 0.017 ms; std 0.000 ms; achieved tflops 15.285 TFLOPs/sec; achieved tb_per_sec 1.941 TB/sec
[PERF] cudnn          :: median time 0.014 ms; std 0.000 ms; achieved tflops 19.109 TFLOPs/sec; achieved tb_per_sec 2.427 TB/sec
[PERF] trtllm-gen     :: median time 0.010 ms; std 0.000 ms; achieved tflops 27.308 TFLOPs/sec; achieved tb_per_sec 3.468 TB/sec
[PERF] trtllm-gen-nati:: median time 0.010 ms; std 0.000 ms; achieved tflops 27.235 TFLOPs/sec; achieved tb_per_sec 3.459 TB/sec
[INFO] args = Namespace(routine='BatchMLAPagedAttentionWrapper', no_cuda_graph=False, use_cupti=False, refcheck=True, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='DeepSeek-R1', generate_repro_command=True, repro_command='', backends=['trtllm-gen-native', 'fa2', 'fa3'], page_size=32, batch_size=16, s_qo=1, s_kv=1024, num_qo_heads=128, num_kv_heads=128, head_dim_qk=None, head_dim_vo=None, head_dim_ckv=512, head_dim_kpe=64, q_dtype='bfloat16', kv_dtype='bfloat16', causal=False, random_actual_seq_len=True)
[INFO] Running testBatchMLAPagedAttentionWrapper
[INFO] FlashInfer version: 0.3.1
[VVERBOSE] gpu_name = 'NVIDIA_B200'
[INFO] To reproduce this test case, run the following command: python3 flashinfer_benchmark.py --routine BatchMLAPagedAttentionWrapper --backends trtllm-gen-native fa2 fa3 --page_size 32 --batch_size 16 --s_qo 1 --s_kv 1024 --num_qo_heads 128 --num_kv_heads 128 --head_dim_ckv 512 --head_dim_kpe 64 --random_actual_seq_len -vv --refcheck --q_dtype bfloat16 --kv_dtype bfloat16 --generate_repro_command --case_tag DeepSeek-R1
[WARNING] fa3 for routine BatchMLAPagedAttentionWrapper is not supported on compute capability 10.0. Skipping.
[VERBOSE] Average actual seq len: 501
[VVERBOSE] actual_seq_lens_kv.flatten() = tensor([ 84, 874, 167, 691, 274, 736,  63, 813, 781, 450, 794, 226, 510, 499,
        524, 541], device='cuda:0', dtype=torch.int32)
[VVERBOSE] q_nope.shape = torch.Size([16, 128, 512])
[VVERBOSE] q_pe.shape = torch.Size([16, 128, 64])
[VVERBOSE] q.shape = torch.Size([16, 128, 576])
[VVERBOSE] num_pages_per_seq = 32
[VVERBOSE] total_num_pages = 512
[VVERBOSE] block_tables.shape = torch.Size([16, 32])
[VVERBOSE] ckv_cache.shape = torch.Size([512, 32, 512])
[VVERBOSE] kpe_cache.shape = torch.Size([512, 32, 64])
[VVERBOSE] kv_cache.shape = torch.Size([512, 32, 576])
[VVERBOSE] qo_indptr.shape = torch.Size([17])
[VVERBOSE] kv_indptr.shape = torch.Size([17])
[VVERBOSE] kv_indices.shape = torch.Size([258])
[VVERBOSE] actual_seq_lens_kv.shape = torch.Size([16, 1, 1, 1])
[VVERBOSE] sm_scale = 0.041666666666666664
[VVERBOSE] workspace_buffer.shape = torch.Size([134217728])
[PERF] trtllm-gen-nati:: median time 0.024 ms; std 0.000 ms; achieved tflops 91.551 TFLOPs/sec; achieved tb_per_sec 0.955 TB/sec
[PERF] fa2            :: median time 0.041 ms; std 0.000 ms; achieved tflops 54.584 TFLOPs/sec; achieved tb_per_sec 0.570 TB/sec
[INFO] args = Namespace(routine='bmm_fp8', no_cuda_graph=False, use_cupti=False, refcheck=True, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag=None, generate_repro_command=True, repro_command='', batch_size=256, m=1, n=1024, k=7168, tile_size=128, group_size=1, scale_major_mode='MN', input_dtype='fp8_e4m3', mat2_dtype='fp8_e4m3', out_dtype='bfloat16', mma_sm=1, backends=['cudnn', 'cublas', 'cutlass'], use_128x4_sf_layout=False, use_nvfp4=False, autotune=False)
[INFO] Running testBmmFp8
[INFO] FlashInfer version: 0.3.1
[VVERBOSE] gpu_name = 'NVIDIA_B200'
[INFO] To reproduce this test case, run the following command: python3 flashinfer_benchmark.py --routine bmm_fp8 --batch_size 256 --m 1 --n 1024 --k 7168 --input_dtype fp8_e4m3 --mat2_dtype fp8_e4m3 --out_dtype bfloat16 --backends cudnn cublas cutlass --refcheck -vv --generate_repro_command
[VVERBOSE] input_fp8.shape = torch.Size([256, 1, 7168])
[VVERBOSE] input_fp8.dtype = torch.float8_e4m3fn
[VVERBOSE] mat2_fp8.shape = torch.Size([256, 7168, 1024])
[VVERBOSE] mat2_fp8.dtype = torch.float8_e4m3fn
[VVERBOSE] input_inv_s = tensor(0.0109, device='cuda:0')
[VVERBOSE] input_inv_s.dtype = torch.float32
[VVERBOSE] mat2_inv_s = tensor(0.0135, device='cuda:0')
[VVERBOSE] mat2_inv_s.dtype = torch.float32
[PERF] cudnn          :: median time 0.286 ms; std 0.000 ms; achieved tflops 13.138 TFLOPs/sec; achieved tb_per_sec 0.026 TB/sec
[PERF] cublas         :: median time 0.286 ms; std 0.000 ms; achieved tflops 13.140 TFLOPs/sec; achieved tb_per_sec 0.026 TB/sec
[PERF] cutlass        :: median time 0.266 ms; std 0.000 ms; achieved tflops 14.142 TFLOPs/sec; achieved tb_per_sec 0.028 TB/sec
[INFO] args = Namespace(routine='bmm_fp8', no_cuda_graph=False, use_cupti=False, refcheck=True, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag=None, generate_repro_command=True, repro_command='', batch_size=64, m=4, n=1024, k=7168, tile_size=128, group_size=1, scale_major_mode='MN', input_dtype='fp8_e4m3', mat2_dtype='fp8_e4m3', out_dtype='bfloat16', mma_sm=1, backends=['cudnn', 'cublas', 'cutlass'], use_128x4_sf_layout=False, use_nvfp4=False, autotune=False)
[INFO] Running testBmmFp8
[INFO] FlashInfer version: 0.3.1
[VVERBOSE] gpu_name = 'NVIDIA_B200'
[INFO] To reproduce this test case, run the following command: python3 flashinfer_benchmark.py --routine bmm_fp8 --batch_size 64 --m 4 --n 1024 --k 7168 --input_dtype fp8_e4m3 --mat2_dtype fp8_e4m3 --out_dtype bfloat16 --backends cudnn cublas cutlass --refcheck -vv --generate_repro_command
[VVERBOSE] input_fp8.shape = torch.Size([64, 4, 7168])
[VVERBOSE] input_fp8.dtype = torch.float8_e4m3fn
[VVERBOSE] mat2_fp8.shape = torch.Size([64, 7168, 1024])
[VVERBOSE] mat2_fp8.dtype = torch.float8_e4m3fn
[VVERBOSE] input_inv_s = tensor(0.0109, device='cuda:0')
[VVERBOSE] input_inv_s.dtype = torch.float32
[VVERBOSE] mat2_inv_s = tensor(0.0131, device='cuda:0')
[VVERBOSE] mat2_inv_s.dtype = torch.float32
[PERF] cudnn          :: median time 0.075 ms; std 0.000 ms; achieved tflops 49.999 TFLOPs/sec; achieved tb_per_sec 0.098 TB/sec
[PERF] cublas         :: median time 0.075 ms; std 0.000 ms; achieved tflops 50.135 TFLOPs/sec; achieved tb_per_sec 0.098 TB/sec
[PERF] cutlass        :: median time 0.072 ms; std 0.000 ms; achieved tflops 51.981 TFLOPs/sec; achieved tb_per_sec 0.102 TB/sec
[INFO] args = Namespace(routine='gemm_fp8_nt_groupwise', no_cuda_graph=False, use_cupti=False, refcheck=True, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag=None, generate_repro_command=True, repro_command='', batch_size=1, m=4, n=1024, k=7168, tile_size=128, group_size=1, scale_major_mode='MN', input_dtype='fp8_e4m3', mat2_dtype='fp8_e4m3', out_dtype='bfloat16', mma_sm=1, backends=['cutlass'], use_128x4_sf_layout=False, use_nvfp4=False, autotune=False)
[INFO] Running testGemmFp8NtGroupwise
[INFO] FlashInfer version: 0.3.1
[VVERBOSE] gpu_name = 'NVIDIA_B200'
[INFO] To reproduce this test case, run the following command: python3 flashinfer_benchmark.py --routine gemm_fp8_nt_groupwise --m 4 --n 1024 --k 7168 --mma_sm 1 --scale_major_mode MN --backends cutlass --refcheck -vv --generate_repro_command
[VVERBOSE] a_val.shape = torch.Size([4, 7168])
[VVERBOSE] b_val.shape = torch.Size([1024, 7168])
[VVERBOSE] a_fp8.shape = torch.Size([4, 7168])
[VVERBOSE] b_fp8.shape = torch.Size([1024, 7168])
[VVERBOSE] a_scale.shape = torch.Size([56, 4])
[VVERBOSE] b_scale.shape = torch.Size([56, 8])
[PERF] cutlass        :: median time 0.020 ms; std 0.000 ms; achieved tflops 2.987 TFLOPs/sec; achieved tb_per_sec 0.375 TB/sec
[INFO] args = Namespace(routine='gemm_fp8_nt_groupwise', no_cuda_graph=False, use_cupti=False, refcheck=True, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag=None, generate_repro_command=True, repro_command='', batch_size=1, m=16, n=1024, k=7168, tile_size=128, group_size=1, scale_major_mode='MN', input_dtype='fp8_e4m3', mat2_dtype='fp8_e4m3', out_dtype='bfloat16', mma_sm=1, backends=['cutlass'], use_128x4_sf_layout=False, use_nvfp4=False, autotune=False)
[INFO] Running testGemmFp8NtGroupwise
[INFO] FlashInfer version: 0.3.1
[VVERBOSE] gpu_name = 'NVIDIA_B200'
[INFO] To reproduce this test case, run the following command: python3 flashinfer_benchmark.py --routine gemm_fp8_nt_groupwise --m 16 --n 1024 --k 7168 --mma_sm 1 --scale_major_mode MN --backends cutlass --refcheck -vv --generate_repro_command
[VVERBOSE] a_val.shape = torch.Size([16, 7168])
[VVERBOSE] b_val.shape = torch.Size([1024, 7168])
[VVERBOSE] a_fp8.shape = torch.Size([16, 7168])
[VVERBOSE] b_fp8.shape = torch.Size([1024, 7168])
[VVERBOSE] a_scale.shape = torch.Size([56, 16])
[VVERBOSE] b_scale.shape = torch.Size([56, 8])
[PERF] cutlass        :: median time 0.020 ms; std 0.000 ms; achieved tflops 11.824 TFLOPs/sec; achieved tb_per_sec 0.377 TB/sec
[INFO] args = Namespace(routine='group_gemm_fp8_nt_groupwise', no_cuda_graph=False, use_cupti=False, refcheck=True, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag=None, generate_repro_command=True, repro_command='', batch_size=1, m=4, n=1024, k=7168, tile_size=128, group_size=2, scale_major_mode='MN', input_dtype='fp8_e4m3', mat2_dtype='fp8_e4m3', out_dtype='bfloat16', mma_sm=1, backends=['cudnn'], use_128x4_sf_layout=False, use_nvfp4=False, autotune=False)
[INFO] Running testGroupGemmFp8NtGroupwise
[INFO] FlashInfer version: 0.3.1
[VVERBOSE] gpu_name = 'NVIDIA_B200'
[INFO] To reproduce this test case, run the following command: python3 flashinfer_benchmark.py --routine group_gemm_fp8_nt_groupwise --m 4 --n 1024 --k 7168 --mma_sm 1 --group_size 2 --scale_major_mode MN --refcheck -vv --generate_repro_command
[VVERBOSE] a_val.shape = torch.Size([8, 7168])
[VVERBOSE] b_val.shape = torch.Size([2, 1024, 7168])
[VVERBOSE] a_fp8.shape = torch.Size([8, 7168])
[VVERBOSE] b_fp8.shape = torch.Size([2, 1024, 7168])
[VVERBOSE] a_scale.shape = torch.Size([56, 8])
[VVERBOSE] b_scale.shape = torch.Size([2, 56, 8])
[VVERBOSE] m_indptr.shape = torch.Size([3])
[PERF] cutlass        :: median time 0.022 ms; std 0.000 ms; achieved tflops 5.261 TFLOPs/sec; achieved tb_per_sec 0.661 TB/sec
[INFO] args = Namespace(routine='group_gemm_fp8_nt_groupwise', no_cuda_graph=False, use_cupti=False, refcheck=True, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag=None, generate_repro_command=True, repro_command='', batch_size=1, m=16, n=1024, k=7168, tile_size=128, group_size=2, scale_major_mode='MN', input_dtype='fp8_e4m3', mat2_dtype='fp8_e4m3', out_dtype='bfloat16', mma_sm=1, backends=['cudnn'], use_128x4_sf_layout=False, use_nvfp4=False, autotune=False)
[INFO] Running testGroupGemmFp8NtGroupwise
[INFO] FlashInfer version: 0.3.1
[VVERBOSE] gpu_name = 'NVIDIA_B200'
[INFO] To reproduce this test case, run the following command: python3 flashinfer_benchmark.py --routine group_gemm_fp8_nt_groupwise --m 16 --n 1024 --k 7168 --mma_sm 1 --group_size 2 --scale_major_mode MN --refcheck -vv --generate_repro_command
[VVERBOSE] a_val.shape = torch.Size([32, 7168])
[VVERBOSE] b_val.shape = torch.Size([2, 1024, 7168])
[VVERBOSE] a_fp8.shape = torch.Size([32, 7168])
[VVERBOSE] b_fp8.shape = torch.Size([2, 1024, 7168])
[VVERBOSE] a_scale.shape = torch.Size([56, 32])
[VVERBOSE] b_scale.shape = torch.Size([2, 56, 8])
[VVERBOSE] m_indptr.shape = torch.Size([3])
[PERF] cutlass        :: median time 0.023 ms; std 0.000 ms; achieved tflops 20.852 TFLOPs/sec; achieved tb_per_sec 0.665 TB/sec
[INFO] args = Namespace(routine='mm_fp4', no_cuda_graph=False, use_cupti=False, refcheck=True, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag=None, generate_repro_command=True, repro_command='', batch_size=1, m=1, n=1024, k=7168, tile_size=128, group_size=1, scale_major_mode='MN', input_dtype='fp8_e4m3', mat2_dtype='fp8_e4m3', out_dtype='bfloat16', mma_sm=1, backends=['cudnn', 'cutlass', 'trtllm'], use_128x4_sf_layout=True, use_nvfp4=True, autotune=False)
[INFO] Running testMmFp4
[INFO] FlashInfer version: 0.3.1
[VVERBOSE] gpu_name = 'NVIDIA_B200'
[INFO] To reproduce this test case, run the following command: python3 flashinfer_benchmark.py --routine mm_fp4 --m 1 --n 1024 --k 7168 --out_dtype bfloat16 --backends cudnn cutlass trtllm --use_128x4_sf_layout --use_nvfp4 --refcheck -vv --generate_repro_command
[VVERBOSE] input_fp4.shape = torch.Size([1, 3584])
[VVERBOSE] input_fp4.dtype = torch.uint8
[VVERBOSE] mat2_fp4.shape = torch.Size([1024, 3584])
[VVERBOSE] mat2_fp4.dtype = torch.uint8
[PERF] cudnn          :: median time 0.013 ms; std 0.000 ms; achieved tflops 1.156 TFLOPs/sec; achieved tb_per_sec 0.289 TB/sec
[PERF] cutlass        :: median time 0.009 ms; std 0.000 ms; achieved tflops 1.593 TFLOPs/sec; achieved tb_per_sec 0.399 TB/sec
[PERF] trtllm         :: median time 0.011 ms; std 0.000 ms; achieved tflops 1.352 TFLOPs/sec; achieved tb_per_sec 0.339 TB/sec
[INFO] args = Namespace(routine='mm_fp4', no_cuda_graph=False, use_cupti=False, refcheck=True, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag=None, generate_repro_command=True, repro_command='', batch_size=1, m=4, n=1024, k=7168, tile_size=128, group_size=1, scale_major_mode='MN', input_dtype='fp8_e4m3', mat2_dtype='fp8_e4m3', out_dtype='bfloat16', mma_sm=1, backends=['cudnn', 'cutlass', 'trtllm'], use_128x4_sf_layout=True, use_nvfp4=True, autotune=False)
[INFO] Running testMmFp4
[INFO] FlashInfer version: 0.3.1
[VVERBOSE] gpu_name = 'NVIDIA_B200'
[INFO] To reproduce this test case, run the following command: python3 flashinfer_benchmark.py --routine mm_fp4 --m 4 --n 1024 --k 7168 --out_dtype bfloat16 --backends cudnn cutlass trtllm --use_128x4_sf_layout --use_nvfp4 --refcheck -vv --generate_repro_command
[VVERBOSE] input_fp4.shape = torch.Size([4, 3584])
[VVERBOSE] input_fp4.dtype = torch.uint8
[VVERBOSE] mat2_fp4.shape = torch.Size([1024, 3584])
[VVERBOSE] mat2_fp4.dtype = torch.uint8
[PERF] cudnn          :: median time 0.013 ms; std 0.000 ms; achieved tflops 4.625 TFLOPs/sec; achieved tb_per_sec 0.291 TB/sec
[PERF] cutlass        :: median time 0.009 ms; std 0.000 ms; achieved tflops 6.372 TFLOPs/sec; achieved tb_per_sec 0.401 TB/sec
[PERF] trtllm         :: median time 0.011 ms; std 0.000 ms; achieved tflops 5.310 TFLOPs/sec; achieved tb_per_sec 0.334 TB/sec
[INFO] args = Namespace(routine='mm_fp4', no_cuda_graph=False, use_cupti=False, refcheck=True, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag=None, generate_repro_command=True, repro_command='', batch_size=1, m=4, n=1024, k=7168, tile_size=128, group_size=1, scale_major_mode='MN', input_dtype='fp8_e4m3', mat2_dtype='fp8_e4m3', out_dtype='bfloat16', mma_sm=1, backends=['cudnn', 'cutlass', 'trtllm'], use_128x4_sf_layout=True, use_nvfp4=True, autotune=True)
[INFO] Running testMmFp4
[INFO] FlashInfer version: 0.3.1
[VVERBOSE] gpu_name = 'NVIDIA_B200'
[INFO] To reproduce this test case, run the following command: python3 flashinfer_benchmark.py --routine mm_fp4 --m 4 --n 1024 --k 7168 --out_dtype bfloat16 --backends cudnn cutlass trtllm --use_128x4_sf_layout --use_nvfp4 --autotune --refcheck -vv --generate_repro_command
[INFO] cudnn backend does not support autotune
[VVERBOSE] input_fp4.shape = torch.Size([4, 3584])
[VVERBOSE] input_fp4.dtype = torch.uint8
[VVERBOSE] mat2_fp4.shape = torch.Size([1024, 3584])
[VVERBOSE] mat2_fp4.dtype = torch.uint8
[INFO] Autotune warmup for mm_fp4: 5 iters
2025-09-23 00:32:18,077 - INFO - autotuner.py:256 - flashinfer.jit: [Autotuner]: Autotuning process starts ...
2025-09-23 00:32:18,224 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process ends
[INFO] Autotune warmup for mm_fp4: 5 iters
2025-09-23 00:32:18,225 - INFO - autotuner.py:256 - flashinfer.jit: [Autotuner]: Autotuning process starts ...
2025-09-23 00:32:18,247 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process ends
[PERF] cutlass_autotun:: median time 0.009 ms; std 0.000 ms; achieved tflops 6.372 TFLOPs/sec; achieved tb_per_sec 0.401 TB/sec
[PERF] trtllm_autotune:: median time 0.011 ms; std 0.000 ms; achieved tflops 5.410 TFLOPs/sec; achieved tb_per_sec 0.340 TB/sec
[INFO] args = Namespace(routine='trtllm_fp4_block_scale_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='trtllm_moe_sample', generate_repro_command=True, repro_command='', num_tokens=1024, hidden_size=1024, intermediate_size=1024, num_experts=256, top_k=8, n_group=8, topk_group=4, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, routing_method='deepseek_v3', use_shuffled_weight=True, weight_layout=0, use_routing_bias=True, use_routing_scales_on_input=False, input_dtype='bfloat16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='base', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=2, gated_act_type=0)
[INFO] Running testTrtllmFp4BlockScaleMoe
[INFO] FlashInfer version: 0.3.1
[VVERBOSE] gpu_name = 'NVIDIA_B200'
[INFO] To reproduce this test case, run the following command: python3 flashinfer_benchmark.py --routine trtllm_fp4_block_scale_moe --num_tokens 1024 --hidden_size 1024 --intermediate_size 1024 --num_experts 256 --top_k 8 --n_group 8 --topk_group 4 --routed_scaling_factor 2.5 --use_routing_bias --routing_method deepseek_v3 --use_shuffled_weight -vv --generate_repro_command --case_tag trtllm_moe_sample
[INFO] Configuration: tokens=1024, hidden=1024, intermediate=1024, experts=256, top_k=8
[VVERBOSE] routing_logits.shape = torch.Size([1024, 256])
[VVERBOSE] hidden_states.shape = torch.Size([1024, 1024])
[VVERBOSE] gemm1_weights_fp4.shape = torch.Size([256, 2048, 512])
[VVERBOSE] gemm2_weights_fp4.shape = torch.Size([256, 1024, 512])
[PERF] trtllm         :: median time 0.224 ms; std 0.000 ms; achieved tflops 230.555 TFLOPs/sec; achieved tb_per_sec 1.818 TB/sec
[INFO] args = Namespace(routine='trtllm_fp4_block_scale_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='trtllm_moe_sample', generate_repro_command=True, repro_command='', num_tokens=1024, hidden_size=1024, intermediate_size=1024, num_experts=128, top_k=8, n_group=None, topk_group=None, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, routing_method='renormalize_naive', use_shuffled_weight=True, weight_layout=0, use_routing_bias=False, use_routing_scales_on_input=False, input_dtype='bfloat16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='base', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=4, gated_act_type=0)
[INFO] Running testTrtllmFp4BlockScaleMoe
[INFO] FlashInfer version: 0.3.1
[VVERBOSE] gpu_name = 'NVIDIA_B200'
[INFO] To reproduce this test case, run the following command: python3 flashinfer_benchmark.py --routine trtllm_fp4_block_scale_moe --num_tokens 1024 --hidden_size 1024 --intermediate_size 1024 --num_experts 128 --top_k 8 --routing_method renormalize_naive --use_shuffled_weight -vv --generate_repro_command --case_tag trtllm_moe_sample
[INFO] Configuration: tokens=1024, hidden=1024, intermediate=1024, experts=128, top_k=8
[VVERBOSE] routing_logits.shape = torch.Size([1024, 128])
[VVERBOSE] hidden_states.shape = torch.Size([1024, 1024])
[VVERBOSE] gemm1_weights_fp4.shape = torch.Size([128, 2048, 512])
[VVERBOSE] gemm2_weights_fp4.shape = torch.Size([128, 1024, 512])
[PERF] trtllm         :: median time 0.226 ms; std 0.000 ms; achieved tflops 227.846 TFLOPs/sec; achieved tb_per_sec 0.903 TB/sec
[INFO] args = Namespace(routine='trtllm_fp8_block_scale_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='trtllm_moe_sample', generate_repro_command=True, repro_command='', num_tokens=1024, hidden_size=1024, intermediate_size=1024, num_experts=256, top_k=8, n_group=8, topk_group=4, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, routing_method='deepseek_v3', use_shuffled_weight=True, weight_layout=0, use_routing_bias=True, use_routing_scales_on_input=False, input_dtype='bfloat16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='base', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=2, gated_act_type=0)
[INFO] Running testTrtllmFp8BlockScaleMoe
[INFO] FlashInfer version: 0.3.1
[VVERBOSE] gpu_name = 'NVIDIA_B200'
[INFO] To reproduce this test case, run the following command: python3 flashinfer_benchmark.py --routine trtllm_fp8_block_scale_moe --num_tokens 1024 --hidden_size 1024 --intermediate_size 1024 --num_experts 256 --top_k 8 --n_group 8 --topk_group 4 --routed_scaling_factor 2.5 --use_routing_bias --routing_method deepseek_v3 --use_shuffled_weight -vv --generate_repro_command --case_tag trtllm_moe_sample
[INFO] Configuration: tokens=1024, hidden=1024, intermediate=1024, experts=256, top_k=8
[VVERBOSE] routing_logits.shape = torch.Size([1024, 256])
[VVERBOSE] hidden_states.shape = torch.Size([1024, 1024])
[VVERBOSE] gemm1_weights_fp8.shape = torch.Size([256, 2048, 1024])
[VVERBOSE] gemm2_weights_fp8.shape = torch.Size([256, 1024, 1024])
[PERF] trtllm         :: median time 0.557 ms; std 0.000 ms; achieved tflops 92.607 TFLOPs/sec; achieved tb_per_sec 1.455 TB/sec
[INFO] args = Namespace(routine='trtllm_fp8_per_tensor_scale_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='trtllm_moe_sample', generate_repro_command=True, repro_command='', num_tokens=1024, hidden_size=1024, intermediate_size=1024, num_experts=128, top_k=1, n_group=None, topk_group=None, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, routing_method='llama4', use_shuffled_weight=False, weight_layout=0, use_routing_bias=True, use_routing_scales_on_input=True, input_dtype='bfloat16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='base', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=3, gated_act_type=0)
[INFO] Running testTrtllmFp8PerTensorScaleMoe
[INFO] FlashInfer version: 0.3.1
[VVERBOSE] gpu_name = 'NVIDIA_B200'
[INFO] To reproduce this test case, run the following command: python3 flashinfer_benchmark.py --routine trtllm_fp8_per_tensor_scale_moe --num_tokens 1024 --hidden_size 1024 --intermediate_size 1024 --num_experts 128 --top_k 1 --routed_scaling_factor 2.5 --use_routing_bias --routing_method llama4 --use_routing_scales_on_input -vv --generate_repro_command --case_tag trtllm_moe_sample
[INFO] Configuration: tokens=1024, hidden=1024, intermediate=1024, experts=128, top_k=1
[VVERBOSE] routing_logits.shape = torch.Size([1024, 128])
[VVERBOSE] hidden_states.shape = torch.Size([1024, 1024])
[VVERBOSE] gemm1_weights_fp8.shape = torch.Size([128, 2048, 1024])
[VVERBOSE] gemm2_weights_fp8.shape = torch.Size([128, 1024, 1024])
[PERF] trtllm         :: median time 0.123 ms; std 0.000 ms; achieved tflops 52.340 TFLOPs/sec; achieved tb_per_sec 3.299 TB/sec
[INFO] args = Namespace(routine='trtllm_fp8_block_scale_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='trtllm_moe_sample', generate_repro_command=True, repro_command='', num_tokens=1024, hidden_size=1024, intermediate_size=1024, num_experts=128, top_k=1, n_group=None, topk_group=None, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, routing_method='renormalize', use_shuffled_weight=True, weight_layout=0, use_routing_bias=False, use_routing_scales_on_input=False, input_dtype='bfloat16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='base', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=1, gated_act_type=0)
[INFO] Running testTrtllmFp8BlockScaleMoe
[INFO] FlashInfer version: 0.3.1
[VVERBOSE] gpu_name = 'NVIDIA_B200'
[INFO] To reproduce this test case, run the following command: python3 flashinfer_benchmark.py --routine trtllm_fp8_block_scale_moe --num_tokens 1024 --hidden_size 1024 --intermediate_size 1024 --num_experts 128 --top_k 1 --routing_method renormalize --use_shuffled_weight -vv --generate_repro_command --case_tag trtllm_moe_sample
[INFO] Configuration: tokens=1024, hidden=1024, intermediate=1024, experts=128, top_k=1
[VVERBOSE] routing_logits.shape = torch.Size([1024, 128])
[VVERBOSE] hidden_states.shape = torch.Size([1024, 1024])
[VVERBOSE] gemm1_weights_fp8.shape = torch.Size([128, 2048, 1024])
[VVERBOSE] gemm2_weights_fp8.shape = torch.Size([128, 1024, 1024])
[PERF] trtllm         :: median time 0.109 ms; std 0.000 ms; achieved tflops 59.297 TFLOPs/sec; achieved tb_per_sec 3.740 TB/sec
[INFO] args = Namespace(routine='cutlass_fused_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='cutlass_moe_base', generate_repro_command=True, repro_command='', num_tokens=32, hidden_size=128, intermediate_size=128, num_experts=2, top_k=2, n_group=None, topk_group=None, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, routing_method='deepseek_v3', use_shuffled_weight=False, weight_layout=0, use_routing_bias=False, use_routing_scales_on_input=False, input_dtype='float16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='base', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=2, gated_act_type=0)
[INFO] Running testCutlassFusedMoe
[INFO] FlashInfer version: 0.3.1
[VVERBOSE] gpu_name = 'NVIDIA_B200'
[INFO] To reproduce this test case, run the following command: python3 flashinfer_benchmark.py --routine cutlass_fused_moe --num_tokens 32 --hidden_size 128 --intermediate_size 128 --num_experts 2 --top_k 2 --cutlass_variant base --input_dtype float16 -vv --generate_repro_command --case_tag cutlass_moe_base
[VVERBOSE] x.shape = torch.Size([32, 128])
[VVERBOSE] w31_weight.shape = torch.Size([2, 256, 128])
[VVERBOSE] w2_weight.shape = torch.Size([2, 128, 128])
[PERF] cutlass        :: median time 0.026 ms; std 0.000 ms; achieved tflops 0.240 TFLOPs/sec; achieved tb_per_sec 0.008 TB/sec
[INFO] args = Namespace(routine='cutlass_fused_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='cutlass_moe_fp8_scale', generate_repro_command=True, repro_command='', num_tokens=32, hidden_size=128, intermediate_size=128, num_experts=2, top_k=2, n_group=None, topk_group=None, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, tile_tokens_dim=8, routing_method='deepseek_v3', use_shuffled_weight=False, weight_layout=0, use_routing_bias=False, use_routing_scales_on_input=False, input_dtype='float16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='fp8', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=2, gated_act_type=0)
[INFO] Running testCutlassFusedMoe
[INFO] FlashInfer version: 0.3.1
[VVERBOSE] gpu_name = 'NVIDIA_B200'
[INFO] To reproduce this test case, run the following command: python3 flashinfer_benchmark.py --routine cutlass_fused_moe --num_tokens 32 --hidden_size 128 --intermediate_size 128 --num_experts 2 --top_k 2 --cutlass_variant fp8 --input_dtype float16 -vv --generate_repro_command --case_tag cutlass_moe_fp8_scale
[VVERBOSE] x.shape = torch.Size([32, 128])
[VVERBOSE] w31_weight.shape = torch.Size([2, 256, 128])
[VVERBOSE] w2_weight.shape = torch.Size([2, 128, 128])
[PERF] cutlass        :: median time 0.026 ms; std 0.000 ms; achieved tflops 0.244 TFLOPs/sec; achieved tb_per_sec 0.004 TB/sec
[INFO] args = Namespace(routine='cutlass_fused_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='cutlass_moe_nvfp4_weights', generate_repro_command=True, repro_command='', num_tokens=32, hidden_size=128, intermediate_size=128, num_experts=2, top_k=2, n_group=None, topk_group=None, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, tile_tokens_dim=8, routing_method='deepseek_v3', use_shuffled_weight=False, weight_layout=0, use_routing_bias=False, use_routing_scales_on_input=False, input_dtype='float16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='nvfp4', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=2, gated_act_type=0)
[INFO] Running testCutlassFusedMoe
[INFO] FlashInfer version: 0.3.1
[VVERBOSE] gpu_name = 'NVIDIA_B200'
[INFO] To reproduce this test case, run the following command: python3 flashinfer_benchmark.py --routine cutlass_fused_moe --num_tokens 32 --hidden_size 128 --intermediate_size 128 --num_experts 2 --top_k 2 --cutlass_variant nvfp4 --input_dtype float16 -vv --generate_repro_command --case_tag cutlass_moe_nvfp4_weights
[VVERBOSE] x.shape = torch.Size([32, 128])
[VVERBOSE] w31_weight.shape = torch.Size([2, 256, 128])
[VVERBOSE] w2_weight.shape = torch.Size([2, 128, 128])
[PERF] cutlass        :: median time 0.030 ms; std 0.000 ms; achieved tflops 0.210 TFLOPs/sec; achieved tb_per_sec 0.002 TB/sec
[INFO] args = Namespace(routine='cutlass_fused_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='cutlass_moe_nvfp4_weights_quantized', generate_repro_command=True, repro_command='', num_tokens=32, hidden_size=128, intermediate_size=128, num_experts=2, top_k=2, n_group=None, topk_group=None, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, tile_tokens_dim=8, routing_method='deepseek_v3', use_shuffled_weight=False, weight_layout=0, use_routing_bias=False, use_routing_scales_on_input=False, input_dtype='float16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='nvfp4', quantized_input=True, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=2, gated_act_type=0)
[INFO] Running testCutlassFusedMoe
[INFO] FlashInfer version: 0.3.1
[VVERBOSE] gpu_name = 'NVIDIA_B200'
[INFO] To reproduce this test case, run the following command: python3 flashinfer_benchmark.py --routine cutlass_fused_moe --num_tokens 32 --hidden_size 128 --intermediate_size 128 --num_experts 2 --top_k 2 --cutlass_variant nvfp4 --quantized_input --input_dtype float16 -vv --generate_repro_command --case_tag cutlass_moe_nvfp4_weights_quantized
[VVERBOSE] x.shape = torch.Size([32, 128])
[VVERBOSE] w31_weight.shape = torch.Size([2, 256, 128])
[VVERBOSE] w2_weight.shape = torch.Size([2, 128, 128])
[PERF] cutlass        :: median time 0.029 ms; std 0.000 ms; achieved tflops 0.213 TFLOPs/sec; achieved tb_per_sec 0.002 TB/sec
[INFO] args = Namespace(routine='cutlass_fused_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='cutlass_moe_nvfp4_ep_tp', generate_repro_command=True, repro_command='', num_tokens=32, hidden_size=128, intermediate_size=128, num_experts=8, top_k=2, n_group=None, topk_group=None, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, tile_tokens_dim=8, routing_method='deepseek_v3', use_shuffled_weight=False, weight_layout=0, use_routing_bias=False, use_routing_scales_on_input=False, input_dtype='float16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='base', quantized_input=False, tp_size=2, tp_rank=0, ep_size=4, ep_rank=0, routing_method_type=2, gated_act_type=0)
[INFO] Running testCutlassFusedMoe
[INFO] FlashInfer version: 0.3.1
[VVERBOSE] gpu_name = 'NVIDIA_B200'
[INFO] To reproduce this test case, run the following command: python3 flashinfer_benchmark.py --routine cutlass_fused_moe --num_tokens 32 --hidden_size 128 --intermediate_size 128 --num_experts 8 --top_k 2 --cutlass_variant base --input_dtype float16 --tp_size 2 --tp_rank 0 --ep_size 4 --ep_rank 0 -vv --generate_repro_command --case_tag cutlass_moe_nvfp4_ep_tp
[VVERBOSE] x.shape = torch.Size([32, 128])
[VVERBOSE] w31_weight.shape = torch.Size([8, 256, 128])
[VVERBOSE] w2_weight.shape = torch.Size([8, 128, 128])
[PERF] cutlass        :: median time 0.025 ms; std 0.000 ms; achieved tflops 0.250 TFLOPs/sec; achieved tb_per_sec 0.032 TB/sec
