{
  "sglang": {
    "llama": {
      "gemm|nvjet": "gemm",
      "fused_moe_kernel|GroupProblemShape|group_gemm_starts|bmm_|GemmUniversal": "moe_gemm",
      "moe|sigmoid": "moe",
      "CatArrayBatched|prepare_inputs": "prepare_next",
      "ncclDevKernel|cross_device_reduce": "nccl_and_custom_ar",
      "_norm_|Norm": "norm",
      "topk": "topk",
      "act_and_mul_": "activation",
      "Rotary": "rope",
      "SoftMax": "softmax",
      "flash|fmha": "attn",
      "elementwise": "elementwise",
      "fp8_quant|cvt_|quantize": "quantize",
      "reduce_kernel": "reduce",
      "triton": "triton_kernel",
      "CUDA mem": "non-gpu-H_D_memops",
      ".*": "misc"
    },
    "ds": {
      "block_fp8_matmul": "block_fp8_gemm",
      "gemm|matmul|nvjet": "gemm",
      "fused_moe_kernel": "moe_gemm",
      "moe|expert|sigmoid": "moe",
      "CatArrayBatched|write_req_to": "prepare_next",
      "ncclDevKernel|cross_device_reduce|all_gather": "nccl_and_custom_ar",
      "Norm": "norm",
      "topk": "topk",
      "activation|act_and_mul": "activation",
      "compute_position_kernel": "rope",
      "elementwise": "elementwise",
      "fp8_quant|quant_fp8|quantize": "quantize",
      "SoftMax": "softmax",
      "reduce": "reduce",
      "_fwd_|create_flash|::mla::|KVCache": "attn",
      "CUDA mem": "non-gpu-H_D_memops",
      ".*": "misc"
    },
    "gpt-oss": {
      "gemm|nvjet": "gemm",
      "fused_moe_kernel|_group_gemm|GroupProblemShape|GemmUniversal|bmm_|matmul_ogs_|_topk_forward|_combined_routing|_sum_bitmatrix_rows|_compute_writeback_idx": "moe_gemm",
      "moe|sigmoid": "moe",
      "CatArrayBatched|prepare_inputs": "prepare_next",
      "_norm_|Norm": "norm",
      "ncclDevKernel|cross_device_reduce|allreduce": "nccl_and_custom_ar",
      "topk|TopK": "topk",
      "act_and_mul_": "activation",
      "Rotary": "rope",
      "SoftMax": "softmax",
      "flash|fmha": "attn",
      "elementwise": "elementwise",
      "fp8_quant|cvt_|quantize": "quantize",
      "reduce_kernel": "reduce",
      "triton": "triton_kernel",
      "CUDA mem": "non-gpu-H_D_memops",
      ".*": "misc"
    }
  }
}