{ "schema_version": 1, "description": "Complementary kernel knowledge map for Humanize-driven GPU kernel optimization. Lists code and knowledge repositories (NVIDIA developer samples, Colfax research kernels, simveit micro-tutorials, Hygon/DCU optimization references, and open Triton kernel libraries such as AITER, AOTriton, Conch, FlagGems, Liger Kernel, Hugging Face kernels, and Triton-distributed) that have no curated PR diffs in the local Route A corpus; each framework entry points to upstream repos, kernel directories, and source guides. Topic entries map kernel topics to per-framework references for live clone/grep workflows. Frameworks already covered by Route A PR bundles (SGLang, vLLM, TensorRT-LLM, PyTorch, FlashAttention, FlashInfer, CUTLASS/CuTe, CCCL, Triton, DeepGEMM, ThunderKittens, TileLang, QuACK, DeepSeek TileKernels) are intentionally excluded.", "frameworks": [ { "id": "nvidia-code-samples", "name": "NVIDIA Developer Code Samples", "repo": "NVIDIA-developer-blog/code-samples", "url": "https://github.com/NVIDIA-developer-blog/code-samples", "kernel_paths": [ "posts", "Samples", "src", "README.md" ], "tags": [ "cuda-samples", "gemm", "transpose", "coalescing", "shared-memory", "bank-conflicts", "reduction", "occupancy" ] }, { "id": "simveit-effective-transpose", "name": "simveit effective_transpose", "repo": "simveit/effective_transpose", "url": "https://github.com/simveit/effective_transpose", "kernel_paths": [ ".", "src", "examples", "README.md" ], "tags": [ "cute-dsl", "quack", "gemm", "tma", "wgmma", "swizzle", "transpose", "rmsnorm", "block-scaled", "nvfp4", "gdn", "hopper", "blackwell" ] }, { "id": "simveit-load-and-store", "name": "simveit load_and_store", "repo": "simveit/load_and_store", "url": "https://github.com/simveit/load_and_store", "kernel_paths": [ ".", "src", "examples", "README.md" ], "tags": [ "cute-dsl", "load-store", "gemm", "tma", "wgmma", "shared-memory", "hopper", "blackwell" ] }, { "id": "colfax-article-src", "name": 
"Colfax article source kernels", "repo": "ColfaxResearch/cfx-article-src", "url": "https://github.com/ColfaxResearch/cfx-article-src", "kernel_paths": [ "tma", "pipeline-gemm", "streamk", "transpose-cute", "cutlass_gemm" ], "tags": [ "cutlass", "cute", "cute-dsl", "gemm", "attention", "tma", "wgmma", "stream-k", "persistent", "transpose", "block-scaled", "blackwell", "hopper" ] }, { "id": "colfax-cutlass-kernels", "name": "Colfax CUTLASS kernels", "repo": "ColfaxResearch/cutlass-kernels", "url": "https://github.com/ColfaxResearch/cutlass-kernels", "kernel_paths": [ "src", "lib/gemm", "include", "examples", "README.md" ], "tags": [ "cutlass", "cute", "gemm", "tma", "wgmma", "stream-k", "persistent", "blackwell", "hopper" ] }, { "id": "hygon-hip-kernel-optimizer", "name": "Hygon HIP Kernel Optimizer Skill", "repo": "yuguo-Jack/cuda-optimized-skill", "url": "https://github.com/yuguo-Jack/cuda-optimized-skill", "kernel_paths": [ "skills/hygon-hip-kernel-optimizer/SKILL.md", "skills/hygon-hip-kernel-optimizer/references/optimization_catalog.md", "skills/hygon-hip-kernel-optimizer/references/dcu_metrics_guide.md", "skills/hygon-hip-kernel-optimizer/references/method_registry.json", "skills/hygon-hip-kernel-optimizer/references/dcu_isa_signatures.json", "skills/hygon-hip-kernel-optimizer/examples/walkthrough.md" ], "tags": [ "dcu", "hygon", "hip", "rocm", "ck-tile", "mmac", "wave64", "hipprof", "dccobjdump", "sqtt" ] }, { "id": "rocm-aiter", "name": "AITER AI Tensor Engine for ROCm", "repo": "ROCm/aiter", "url": "https://github.com/ROCm/aiter", "kernel_paths": [ "aiter", "op_tests", "docs", "gradlib", "csrc", "requirements-triton-comms.txt", ".github/scripts/install_triton.sh", "README.md" ], "tags": [ "triton", "rocm", "aiter", "attention", "mla", "paged-attention", "fused-moe", "gemm", "rmsnorm", "quantization", "communication" ] }, { "id": "rocm-aotriton", "name": "AOTriton Ahead-of-Time Triton Math Library", "repo": "ROCm/aotriton", "url": 
"https://github.com/ROCm/aotriton", "kernel_paths": [ "v2python", "v2src", "v3python", "v3src", "tritonsrc", "include/aotriton", "test", "docs", "README.md" ], "tags": [ "triton", "rocm", "aot", "aotriton", "flash-attention", "sdpa", "attention", "compiler", "codegen" ] }, { "id": "stackav-conch", "name": "Conch Triton Kernel Standard Library", "repo": "stackav-oss/conch", "url": "https://github.com/stackav-oss/conch", "kernel_paths": [ "conch", "tests", "benchmarks", "README.md", "pyproject.toml" ], "tags": [ "triton", "rocm", "standard-library", "paged-attention", "varlen-attention", "rmsnorm", "rotary", "kv-cache", "fp8", "int8", "quantization", "vllm" ] }, { "id": "flaggems", "name": "FlagGems Triton Operator Library", "repo": "flagos-ai/FlagGems", "url": "https://github.com/flagos-ai/FlagGems", "kernel_paths": [ "src/flag_gems", "benchmark", "tests", "modules_tests", "experimental_tests", "triton_src", "docs", "README.md" ], "tags": [ "triton", "pytorch", "operator-library", "llm", "backend-neutral", "multi-backend", "aten", "normalization", "reduction", "elementwise", "quantization" ] }, { "id": "liger-kernel", "name": "Liger Kernel Triton Kernels for LLM Training", "repo": "linkedin/Liger-Kernel", "url": "https://github.com/linkedin/Liger-Kernel", "kernel_paths": [ "src/liger_kernel", "test", "benchmark", "examples", "docs", "README.md" ], "tags": [ "triton", "llm-training", "rmsnorm", "rope", "swiglu", "cross-entropy", "fused-linear-cross-entropy", "loss", "amd" ] }, { "id": "huggingface-kernels", "name": "Hugging Face Kernels and kernels-community Hub", "repo": "huggingface/kernels", "url": "https://github.com/huggingface/kernels", "kernel_paths": [ "src", "kernel-builder", "tests", "README.md", "https://huggingface.co/kernels", "https://huggingface.co/kernels-community" ], "tags": [ "triton", "kernel-hub", "kernels-community", "paged-attention", "triton-moe", "triton-scaled-mm", "rmsnorm", "rotary", "quantization" ] }, { "id": "triton-distributed", 
"name": "Triton-distributed", "repo": "ByteDance-Seed/Triton-distributed", "url": "https://github.com/ByteDance-Seed/Triton-distributed", "kernel_paths": [ "python", "lib", "include", "csrc", "docs", "tests", "README.md" ], "tags": [ "triton", "distributed", "communication-overlap", "gemm-allreduce", "allgather-gemm", "reduce-scatter", "moe", "flash-decode", "amd", "nvidia" ] } ], "topics": [ { "id": "attention", "name": "Attention / FMHA / Paged", "applies_to": [ "simveit-effective-transpose", "simveit-load-and-store", "colfax-article-src", "colfax-cutlass-kernels", "hygon-hip-kernel-optimizer", "rocm-aiter", "rocm-aotriton", "stackav-conch", "huggingface-kernels", "triton-distributed" ], "tags": [ "attention", "fmha", "paged", "kv-cache", "softmax", "online" ] }, { "id": "matmul-gemm", "name": "GEMM / Tensor-Core matmul", "applies_to": [ "nvidia-code-samples", "simveit-effective-transpose", "simveit-load-and-store", "colfax-article-src", "colfax-cutlass-kernels", "hygon-hip-kernel-optimizer", "rocm-aiter", "stackav-conch", "flaggems", "huggingface-kernels", "triton-distributed" ], "tags": [ "gemm", "matmul", "tensor-core", "wgmma", "tma", "epilogue" ] }, { "id": "moe", "name": "Mixture of Experts", "applies_to": [ "simveit-effective-transpose", "simveit-load-and-store", "colfax-article-src", "colfax-cutlass-kernels", "hygon-hip-kernel-optimizer", "rocm-aiter", "huggingface-kernels", "triton-distributed" ], "tags": [ "moe", "fused-moe", "grouped-gemm", "align-block-size", "top-k", "permute" ] }, { "id": "normalization", "name": "RMSNorm / LayerNorm fused norms", "applies_to": [ "nvidia-code-samples", "simveit-effective-transpose", "hygon-hip-kernel-optimizer", "rocm-aiter", "stackav-conch", "flaggems", "liger-kernel", "huggingface-kernels" ], "tags": [ "rmsnorm", "layernorm", "fused-add-norm", "qk-norm", "qknorm" ] }, { "id": "activation-fusion", "name": "Activation / element-wise fusion", "applies_to": [ "simveit-effective-transpose", 
"hygon-hip-kernel-optimizer", "rocm-aiter", "stackav-conch", "flaggems", "liger-kernel", "huggingface-kernels" ], "tags": [ "silu", "gelu", "swiglu", "fused-add", "fused-residual" ] }, { "id": "quantization-fp8", "name": "FP8 / FP4 / INT8 quantization", "applies_to": [ "simveit-effective-transpose", "simveit-load-and-store", "colfax-article-src", "colfax-cutlass-kernels", "hygon-hip-kernel-optimizer", "rocm-aiter", "stackav-conch", "flaggems", "huggingface-kernels" ], "tags": [ "fp8", "fp4", "int8", "awq", "gptq", "block-scaled", "per-tensor", "per-channel" ] }, { "id": "triton-open-kernel-libraries", "name": "Open Triton kernel libraries", "applies_to": [ "rocm-aiter", "rocm-aotriton", "stackav-conch", "flaggems", "liger-kernel", "huggingface-kernels", "triton-distributed" ], "tags": [ "triton", "open-source", "llm", "dcu", "rocm", "benchmark", "unit-test", "reference-implementation" ] }, { "id": "distributed-triton", "name": "Distributed Triton and compute-communication overlap", "applies_to": [ "rocm-aiter", "triton-distributed" ], "tags": [ "triton", "distributed", "communication", "allreduce", "allgather", "reduce-scatter", "moe", "tensor-parallel", "expert-parallel" ] } ] }