{
  "schema_version": 1,
  "description": "Complementary kernel knowledge map for Humanize-driven GPU kernel optimization. Lists code and knowledge repositories that have no curated PR diffs in the local Route A corpus (NVIDIA developer samples, Colfax research kernels, simveit micro-tutorials, Hygon/DCU optimization references); each framework entry points to upstream repos, kernel directories, and source guides. Topic entries map kernel topics to per-framework references for live clone/grep workflows. Frameworks already covered by Route A PR bundles (SGLang, vLLM, TensorRT-LLM, PyTorch, FlashAttention, FlashInfer, CUTLASS/CuTe, CCCL, Triton, DeepGEMM, ThunderKittens, TileLang, QuACK, DeepSeek TileKernels) are intentionally excluded.",
  "frameworks": [
    {
      "id": "nvidia-code-samples",
      "name": "NVIDIA Developer Code Samples",
      "repo": "NVIDIA-developer-blog/code-samples",
      "url": "https://github.com/NVIDIA-developer-blog/code-samples",
      "kernel_paths": [
        "posts",
        "Samples",
        "src",
        "README.md"
      ],
      "tags": [
        "cuda-samples",
        "gemm",
        "transpose",
        "coalescing",
        "shared-memory",
        "bank-conflicts",
        "reduction",
        "occupancy"
      ]
    },
    {
      "id": "simveit-effective-transpose",
      "name": "simveit effective_transpose",
      "repo": "simveit/effective_transpose",
      "url": "https://github.com/simveit/effective_transpose",
      "kernel_paths": [
        ".",
        "src",
        "examples",
        "README.md"
      ],
      "tags": [
        "cute-dsl",
        "quack",
        "gemm",
        "tma",
        "wgmma",
        "swizzle",
        "transpose",
        "rmsnorm",
        "block-scaled",
        "nvfp4",
        "gdn",
        "hopper",
        "blackwell"
      ]
    },
    {
      "id": "simveit-load-and-store",
      "name": "simveit load_and_store",
      "repo": "simveit/load_and_store",
      "url": "https://github.com/simveit/load_and_store",
      "kernel_paths": [
        ".",
        "src",
        "examples",
        "README.md"
      ],
      "tags": [
        "cute-dsl",
        "load-store",
        "gemm",
        "tma",
        "wgmma",
        "shared-memory",
        "hopper",
        "blackwell"
      ]
    },
    {
      "id": "colfax-article-src",
      "name": "Colfax article source kernels",
      "repo": "ColfaxResearch/cfx-article-src",
      "url": "https://github.com/ColfaxResearch/cfx-article-src",
      "kernel_paths": [
        "tma",
        "pipeline-gemm",
        "streamk",
        "transpose-cute",
        "cutlass_gemm"
      ],
      "tags": [
        "cutlass",
        "cute",
        "cute-dsl",
        "gemm",
        "attention",
        "tma",
        "wgmma",
        "stream-k",
        "persistent",
        "transpose",
        "block-scaled",
        "blackwell",
        "hopper"
      ]
    },
    {
      "id": "colfax-cutlass-kernels",
      "name": "Colfax CUTLASS kernels",
      "repo": "ColfaxResearch/cutlass-kernels",
      "url": "https://github.com/ColfaxResearch/cutlass-kernels",
      "kernel_paths": [
        "src",
        "lib/gemm",
        "include",
        "examples",
        "README.md"
      ],
      "tags": [
        "cutlass",
        "cute",
        "gemm",
        "tma",
        "wgmma",
        "stream-k",
        "persistent",
        "blackwell",
        "hopper"
      ]
    },
    {
      "id": "hygon-hip-kernel-optimizer",
      "name": "Hygon HIP Kernel Optimizer Skill",
      "repo": "yuguo-Jack/cuda-optimized-skill",
      "url": "https://github.com/yuguo-Jack/cuda-optimized-skill",
      "kernel_paths": [
        "skills/hygon-hip-kernel-optimizer/SKILL.md",
        "skills/hygon-hip-kernel-optimizer/references/optimization_catalog.md",
        "skills/hygon-hip-kernel-optimizer/references/dcu_metrics_guide.md",
        "skills/hygon-hip-kernel-optimizer/references/method_registry.json",
        "skills/hygon-hip-kernel-optimizer/references/dcu_isa_signatures.json",
        "skills/hygon-hip-kernel-optimizer/examples/walkthrough.md"
      ],
      "tags": [
        "dcu",
        "hygon",
        "hip",
        "rocm",
        "ck-tile",
        "mmac",
        "wave64",
        "hipprof",
        "dccobjdump",
        "sqtt"
      ]
    }
  ],
  "topics": [
    {
      "id": "attention",
      "name": "Attention / FMHA / Paged",
      "applies_to": [
        "simveit-effective-transpose",
        "simveit-load-and-store",
        "colfax-article-src",
        "colfax-cutlass-kernels",
        "hygon-hip-kernel-optimizer"
      ],
      "tags": [
        "attention",
        "fmha",
        "paged",
        "kv-cache",
        "softmax",
        "online"
      ]
    },
    {
      "id": "matmul-gemm",
      "name": "GEMM / Tensor-Core matmul",
      "applies_to": [
        "nvidia-code-samples",
        "simveit-effective-transpose",
        "simveit-load-and-store",
        "colfax-article-src",
        "colfax-cutlass-kernels",
        "hygon-hip-kernel-optimizer"
      ],
      "tags": [
        "gemm",
        "matmul",
        "tensor-core",
        "wgmma",
        "tma",
        "epilogue"
      ]
    },
    {
      "id": "moe",
      "name": "Mixture of Experts",
      "applies_to": [
        "simveit-effective-transpose",
        "simveit-load-and-store",
        "colfax-article-src",
        "colfax-cutlass-kernels",
        "hygon-hip-kernel-optimizer"
      ],
      "tags": [
        "moe",
        "fused-moe",
        "grouped-gemm",
        "align-block-size",
        "top-k",
        "permute"
      ]
    },
    {
      "id": "normalization",
      "name": "RMSNorm / LayerNorm fused norms",
      "applies_to": [
        "nvidia-code-samples",
        "simveit-effective-transpose",
        "hygon-hip-kernel-optimizer"
      ],
      "tags": [
        "rmsnorm",
        "layernorm",
        "fused-add-norm",
        "qk-norm",
        "qknorm"
      ]
    },
    {
      "id": "activation-fusion",
      "name": "Activation / element-wise fusion",
      "applies_to": [
        "simveit-effective-transpose",
        "hygon-hip-kernel-optimizer"
      ],
      "tags": [
        "silu",
        "gelu",
        "swiglu",
        "fused-add",
        "fused-residual"
      ]
    },
    {
      "id": "quantization-fp8",
      "name": "FP8 / FP4 / INT8 quantization",
      "applies_to": [
        "simveit-effective-transpose",
        "simveit-load-and-store",
        "colfax-article-src",
        "colfax-cutlass-kernels",
        "hygon-hip-kernel-optimizer"
      ],
      "tags": [
        "fp8",
        "fp4",
        "int8",
        "awq",
        "gptq",
        "block-scaled",
        "per-tensor",
        "per-channel"
      ]
    }
  ]
}