Commit 9c26b762 authored by chenych

Add fp8 inference

parent f229f29b
......@@ -15,12 +15,11 @@ The DeepSeek-V4 series introduces several key upgrades in architecture and optimization:
| Software | Version |
| :------: |:-------:|
| DTK | 26.04 |
| Python | 3.10.12 |
| Torch | 2.9.0+das.opt1.dtk2604.20260331.g4e3c1e7 |
| Tilelang | 0.1.7.post3+cpu.git52700923 |
Currently only the following image is supported: harbor.sourcefind.cn:5443/dcu/admin/base/custom:torch-2.9.0-ubuntu22.04-dtk26.04-deepseek-v4-0425
- Adjust the mount path (`-v`) according to the actual model location
......@@ -44,8 +43,6 @@ docker run -it \
More images are available for download from [光源](https://sourcefind.cn/#/service-list).
The special deep-learning libraries required by DCU GPUs for this project can be downloaded and installed from the [光合](https://developer.sourcefind.cn/tool/) developer community.
## Dataset
`None for now`
......@@ -55,11 +52,11 @@ docker run -it \
## Inference
### PyTorch
#### Single-node inference
##### BF16
1. Model conversion and sharding
```bash
# Note: set the paths and parameters in the script to your actual values
# INPUT_FP8_HF_PATH: path to the original model; OUTPUT_BF16_HF_PATH: directory for the BF16 model; SAVE_PATH: directory for the sharded model; adjust mp to the actual number of cards
cd convert_weight
bash convert_weight.sh
```
......@@ -67,10 +64,25 @@ bash convert_weight.sh
2. Start chat inference
```bash
# Note: set the paths and parameters in the script to your actual values
cd ../inference-bf16
sh start_torch.sh
```
##### FP8
1. Model conversion and sharding (a quick check of the converted shards is sketched after step 2)
```bash
# Note: set the paths and parameters in the script to your actual values
# --hf-ckpt-path: path to the original model; --save-path: directory for the sharded FP8 model; adjust MP to the actual number of cards (default 8)
cd inference-fp8
bash cast_fp4_to_fp8.sh
```
2. Start chat inference
```bash
# Note: set the paths and parameters in the script to your actual values
sh start_torch_fp8.sh
```
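As an optional sanity check (a minimal sketch, not part of the provided scripts), the converted shards can be inspected with `safetensors`. The shard path below is a placeholder matching the example in `cast_fp4_to_fp8.sh`; with `--expert-dtype fp8`, routed-expert weights are stored as `float8_e4m3fn` with `float8_e8m0fnu` block scales:
```python
# Minimal sketch: inspect one converted shard (adjust the path to your --save-path).
from safetensors.torch import safe_open

shard = "/path/of/DeepSeek-V4-Flash-FP8-MP8/model0-mp8.safetensors"  # shards are named model{rank}-mp{MP}.safetensors
with safe_open(shard, framework="pt", device="cpu") as f:
    for name in f.keys():
        # Routed expert weights should be FP8 after conversion; their block scales are e8m0.
        if "experts" in name and "shared_experts" not in name and name.endswith(".weight"):
            print(name, f.get_tensor(name).dtype)  # expected: torch.float8_e4m3fn
            break
```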
## Results
**Note**: During the first conversation, dtk hipcc compilation warnings may appear while the kernels are being compiled; this is normal.
......
#!/bin/bash
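# Cast the original FP4 (e2m1) expert weights to FP8 (e4m3) and shard the checkpoint across MP model-parallel ranks.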
export EXPERTS=256
export MP=8
python convert.py --hf-ckpt-path deepseek-ai/DeepSeek-V4-Flash \
--save-path /path/of/DeepSeek-V4-Flash-FP8-MP8 \
--n-experts ${EXPERTS} \
--model-parallel ${MP} \
--expert-dtype fp8
{
"vocab_size": 129280,
"dim": 4096,
"moe_inter_dim": 2048,
"n_layers": 43,
"n_hash_layers": 3,
"n_heads": 64,
"n_routed_experts": 256,
"n_shared_experts": 1,
"n_activated_experts": 6,
"score_func": "sqrtsoftplus",
"route_scale": 1.5,
"swiglu_limit": 10.0,
"q_lora_rank": 1024,
"head_dim": 512,
"rope_head_dim": 64,
"o_groups": 8,
"o_lora_rank": 1024,
"window_size": 128,
"original_seq_len": 65536,
"rope_theta": 10000,
"rope_factor": 16,
"beta_fast": 32,
"beta_slow": 1,
"index_n_heads": 64,
"index_head_dim": 128,
"index_topk": 512,
"hc_mult": 4,
"hc_sinkhorn_iters": 20,
"dtype": "fp8",
"scale_fmt": "ue8m0",
"compress_rope_theta": 160000,
"compress_ratios": [0, 0, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 0]
}
\ No newline at end of file
import os
import shutil
from argparse import ArgumentParser
from glob import glob
from tqdm import tqdm, trange
import torch
from safetensors.torch import safe_open, save_file
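# e2m1 (FP4) value table indexed by the 4-bit code: entries 0-7 are the positive values, 8-15 their negatives (high bit = sign).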
FP4_TABLE = torch.tensor([
0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0,
0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0
], dtype=torch.float32)
def cast_e2m1fn_to_e4m3fn(x: torch.Tensor, scale: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
"""
Casts a tensor from e2m1fn to e4m3fn losslessly.
"""
assert x.dtype == torch.int8
assert x.ndim == 2
out_dim, in_dim = x.size()
in_dim *= 2
fp8_block_size = 128
fp4_block_size = 32
assert in_dim % fp8_block_size == 0 and out_dim % fp8_block_size == 0
assert scale.size(0) == out_dim and scale.size(1) == in_dim // fp4_block_size
x = x.view(torch.uint8)
low = x & 0x0F
high = (x >> 4) & 0x0F
x = torch.stack([FP4_TABLE[low.long()], FP4_TABLE[high.long()]], dim=-1).flatten(2)
# max_fp4 (6.0) * MAX_OFFSET must fit in e4m3fn (max 448)
# 6.0 * 2^6 = 384 < 448; 6.0 * 2^7 = 768 > 448; so MAX_OFFSET_BITS = 6
MAX_OFFSET_BITS = 6
bOut = out_dim // fp8_block_size
bIn = in_dim // fp8_block_size
# bOut, bIn, 128, 128
x = x.view(bOut, fp8_block_size, bIn, fp8_block_size).transpose(1, 2)
# bOut, bIn, 128*4
scale = scale.float().view(bOut, fp8_block_size, bIn, -1).transpose(1, 2).flatten(2)
## bOut, bIn, 1
scale_max_offset_bits = scale.amax(dim=-1, keepdim=True) / (2**MAX_OFFSET_BITS)
# bOut, bIn, 128*4
offset = scale / scale_max_offset_bits
# bOut, bIn, 128, 128
offset = offset.unflatten(-1, (fp8_block_size, -1)).repeat_interleave(fp4_block_size, dim=-1)
x = (x * offset).transpose(1, 2).reshape(out_dim, in_dim)
return x.to(torch.float8_e4m3fn), scale_max_offset_bits.squeeze(-1).to(torch.float8_e8m0fnu)
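# Maps HF checkpoint parameter names to this repo's names; the second element is the dim to shard across model-parallel ranks (None = replicate).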
mapping = {
"embed_tokens": ("embed", 0),
"input_layernorm": ("attn_norm", None),
"post_attention_layernorm": ("ffn_norm", None),
"q_proj": ("wq", 0),
"q_a_proj": ("wq_a", None),
"q_a_layernorm": ("q_norm", None),
"q_b_proj": ("wq_b", 0),
"kv_a_proj_with_mqa": ("wkv_a", None),
"kv_a_layernorm": ("kv_norm", None),
"kv_b_proj": ("wkv_b", 0),
"o_proj": ("wo", 1),
"gate_proj": ("w1", 0),
"down_proj": ("w2", 1),
"up_proj": ("w3", 0),
"lm_head": ("head", 0),
"embed": ("embed", 0),
"wq_b": ("wq_b", 0),
"wo_a": ("wo_a", 0),
"wo_b": ("wo_b", 1),
"head": ("head", 0),
"attn_sink": ("attn_sink", 0),
"weights_proj": ("weights_proj", 0),
}
def main(hf_ckpt_path, save_path, n_experts, mp, expert_dtype):
"""
Converts and saves model checkpoint files into a specified format.
Args:
hf_ckpt_path (str): Path to the directory containing the input checkpoint files.
save_path (str): Path to the directory where the converted checkpoint files will be saved.
n_experts (int): Total number of experts in the model.
        mp (int): Model parallelism factor.
        expert_dtype (str, optional): Target dtype for routed expert weights; "fp8" casts packed e2m1 weights to e4m3, any other value keeps them as packed FP4.
Returns:
None
"""
torch.set_num_threads(8)
n_local_experts = n_experts // mp
state_dicts = [{} for _ in range(mp)]
for file_path in tqdm(glob(os.path.join(hf_ckpt_path, "*.safetensors"))):
with safe_open(file_path, framework="pt", device="cpu") as f:
for name in f.keys():
param: torch.Tensor = f.get_tensor(name)
if name.startswith("model."):
name = name[len("model."):]
if name.startswith("mtp.") and ("emb" in name or name.endswith("head.weight")):
continue
name = name.replace("self_attn", "attn")
name = name.replace("mlp", "ffn")
name = name.replace("weight_scale_inv", "scale")
name = name.replace("e_score_correction_bias", "bias")
if any(x in name for x in ["hc", "attn_sink", "tie2eid", "ape"]): # without .weight
key = name.split(".")[-1]
else:
key = name.split(".")[-2]
if key in mapping:
new_key, dim = mapping[key]
else:
new_key, dim = key, None
name = name.replace(key, new_key)
for i in range(mp):
new_param = param
if "experts" in name and "shared_experts" not in name:
idx = int(name.split(".")[-3])
if idx < i * n_local_experts or idx >= (i + 1) * n_local_experts:
continue
elif dim is not None:
assert param.size(dim) % mp == 0, f"Dimension {dim} must be divisible by {mp}"
shard_size = param.size(dim) // mp
new_param = param.narrow(dim, i * shard_size, shard_size).contiguous()
state_dicts[i][name] = new_param
os.makedirs(save_path, exist_ok=True)
for i in trange(mp):
names = list(state_dicts[i].keys())
for name in names:
if name.endswith("wo_a.weight"):
weight = state_dicts[i][name]
scale = state_dicts[i].pop(name.replace("weight", "scale"))
weight = weight.unflatten(0, (-1, 128)).unflatten(-1, (-1, 128)).float() * scale[:, None, :, None].float()
state_dicts[i][name] = weight.flatten(2, 3).flatten(0, 1).bfloat16()
elif "experts" in name and state_dicts[i][name].dtype == torch.int8:
if expert_dtype == "fp8":
scale_name = name.replace("weight", "scale")
weight = state_dicts[i].pop(name)
scale = state_dicts[i].pop(scale_name)
state_dicts[i][name], state_dicts[i][scale_name] = cast_e2m1fn_to_e4m3fn(weight, scale)
else:
state_dicts[i][name] = state_dicts[i][name].view(torch.float4_e2m1fn_x2)
save_file(state_dicts[i], os.path.join(save_path, f"model{i}-mp{mp}.safetensors"))
for file in ["tokenizer.json", "tokenizer_config.json"]:
old_file_path = os.path.join(hf_ckpt_path, file)
new_file_path = os.path.join(save_path, file)
if os.path.exists(old_file_path):
shutil.copyfile(old_file_path, new_file_path)
if __name__ == "__main__":
parser = ArgumentParser()
parser.add_argument("--hf-ckpt-path", type=str, required=True)
parser.add_argument("--save-path", type=str, required=True)
parser.add_argument("--n-experts", type=int, required=True)
parser.add_argument("--model-parallel", type=int, required=True)
parser.add_argument("--expert-dtype", type=str, choices=["fp8", "fp4"], required=False, default=None)
args = parser.parse_args()
assert args.n_experts % args.model_parallel == 0, "Number of experts must be divisible by model parallelism"
main(args.hf_ckpt_path, args.save_path, args.n_experts, args.model_parallel, args.expert_dtype)
import os
import json
import sys
from argparse import ArgumentParser
from typing import List
import torch
import torch.distributed as dist
from transformers import AutoTokenizer
from safetensors.torch import load_model
from model import Transformer, ModelArgs
from encoding_dsv4 import encode_messages, parse_message_from_completion_text
def sample(logits, temperature: float = 1.0):
"""Gumbel-max trick: equivalent to multinomial sampling but faster on GPU,
since it avoids the GPU-to-CPU sync in torch.multinomial."""
logits = logits / max(temperature, 1e-5)
probs = torch.softmax(logits, dim=-1, dtype=torch.float32)
return probs.div_(torch.empty_like(probs).exponential_(1)).argmax(dim=-1)
@torch.inference_mode()
def generate(
model: Transformer,
prompt_tokens: List[List[int]],
max_new_tokens: int,
eos_id: int,
temperature: float = 1.0
) -> List[List[int]]:
"""Batch generation with left-padded prompts.
The first forward pass processes [min_prompt_len:] tokens (prefill phase).
Subsequent passes generate one token at a time (decode phase). For positions
still within a prompt, the ground-truth token overrides the model's prediction.
"""
prompt_lens = [len(t) for t in prompt_tokens]
assert max(prompt_lens) <= model.max_seq_len, f"Prompt length exceeds model maximum sequence length (max_seq_len={model.max_seq_len})"
total_len = min(model.max_seq_len, max_new_tokens + max(prompt_lens))
tokens = torch.full((len(prompt_tokens), total_len), -1, dtype=torch.long)
for i, t in enumerate(prompt_tokens):
tokens[i, :len(t)] = torch.tensor(t, dtype=torch.long)
prev_pos = 0
finished = torch.tensor([False] * len(prompt_tokens))
prompt_mask = tokens != -1
for cur_pos in range(min(prompt_lens), total_len):
logits = model.forward(tokens[:, prev_pos:cur_pos], prev_pos)
if temperature > 0:
next_token = sample(logits, temperature)
else:
next_token = logits.argmax(dim=-1)
next_token = torch.where(prompt_mask[:, cur_pos], tokens[:, cur_pos], next_token)
tokens[:, cur_pos] = next_token
finished |= torch.logical_and(~prompt_mask[:, cur_pos], next_token == eos_id)
prev_pos = cur_pos
if finished.all():
break
completion_tokens = []
for i, toks in enumerate(tokens.tolist()):
toks = toks[prompt_lens[i]:prompt_lens[i]+max_new_tokens]
if eos_id in toks:
toks = toks[:toks.index(eos_id)]
toks.append(eos_id)
completion_tokens.append(toks)
return completion_tokens
def main(
ckpt_path: str,
config: str,
input_file: str = "",
interactive: bool = True,
max_new_tokens: int = 100,
temperature: float = 1.0,
) -> None:
world_size = int(os.getenv("WORLD_SIZE", "1"))
rank = int(os.getenv("RANK", "0"))
local_rank = int(os.getenv("LOCAL_RANK", "0"))
if world_size > 1:
dist.init_process_group("nccl")
global print
if rank != 0:
print = lambda *_, **__: None
torch.cuda.set_device(local_rank)
torch.cuda.memory._set_allocator_settings("expandable_segments:True")
torch.set_default_dtype(torch.bfloat16)
torch.set_num_threads(8)
torch.manual_seed(33377335)
with open(config) as f:
args = ModelArgs(**json.load(f))
if interactive:
args.max_batch_size = 1
print(args)
with torch.device("cuda"):
model = Transformer(args)
tokenizer = AutoTokenizer.from_pretrained(ckpt_path)
print("load model")
load_model(model, os.path.join(ckpt_path, f"model{rank}-mp{world_size}.safetensors"), strict=False)
torch.set_default_device("cuda")
print("I'm DeepSeek 👋")
if interactive:
messages = []
while True:
if world_size == 1:
prompt = input(">>> ")
elif rank == 0:
prompt = input(">>> ")
objects = [prompt]
dist.broadcast_object_list(objects, 0)
else:
objects = [None]
dist.broadcast_object_list(objects, 0)
prompt = objects[0]
if prompt == "/exit":
break
elif prompt == "/clear":
messages.clear()
continue
messages.append({"role": "user", "content": prompt})
prompt_tokens = tokenizer.encode(encode_messages(messages, thinking_mode="chat"))
completion_tokens = generate(model, [prompt_tokens], max_new_tokens, tokenizer.eos_token_id, temperature)
completion = tokenizer.decode(completion_tokens[0])
print(completion)
messages.append(parse_message_from_completion_text(completion, thinking_mode="chat"))
else:
with open(input_file) as f:
prompts = f.read().split("\n\n")
prompt_tokens = [tokenizer.encode(encode_messages([{"role": "user", "content": prompt}], thinking_mode="chat")) for prompt in prompts]
completion_tokens = generate(model, prompt_tokens, max_new_tokens, tokenizer.eos_token_id, temperature)
completions = tokenizer.batch_decode(completion_tokens)
for prompt, completion in zip(prompts, completions):
print("Prompt:", prompt)
print("Completion:", completion)
print()
if world_size > 1:
dist.destroy_process_group()
if __name__ == "__main__":
parser = ArgumentParser()
parser.add_argument("--ckpt-path", type=str, required=True)
parser.add_argument("--config", type=str, required=True)
parser.add_argument("--input-file", type=str, default="")
parser.add_argument("--interactive", action="store_true")
parser.add_argument("--max-new-tokens", type=int, default=300)
parser.add_argument("--temperature", type=float, default=0.6)
args = parser.parse_args()
assert args.input_file or args.interactive, "Either input-file or interactive mode must be specified"
main(args.ckpt_path, args.config, args.input_file, args.interactive, args.max_new_tokens, args.temperature)
import torch
import tilelang
import tilelang.language as T
from typing import Tuple, Optional
tilelang.set_log_level("WARNING")
pass_configs = {
tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
tilelang.PassConfigKey.TL_ENABLE_AGGRESSIVE_SHARED_MEMORY_MERGE: True,
}
FP8 = "float8_e4m3"
FP4 = "float4_e2m1fn"
FE8M0 = "float8_e8m0fnu"
BF16 = "bfloat16"
FP32 = "float32"
INT32 = "int32"
def fast_log2_ceil(x):
"""Compute ceil(log2(x)) via IEEE 754 bit manipulation. Avoids slow log/ceil intrinsics."""
bits_x = T.reinterpret("uint32", x)
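    # IEEE-754 float32 layout: bits 30-23 are the biased exponent (bias 127), bits 22-0 the mantissa.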
exp_x = (bits_x >> 23) & 0xFF
man_bits = bits_x & ((1 << 23) - 1)
return T.Cast("int32", exp_x - 127 + T.if_then_else(man_bits != 0, 1, 0))
def fast_pow2(x):
"""Compute 2^x for integer x via IEEE 754 bit manipulation."""
bits_x = (x + 127) << 23
return T.reinterpret("float32", bits_x)
def fast_round_scale(amax, fp8_max_inv):
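    """Round a scale up to the next power of two: 2^ceil(log2(amax * fp8_max_inv))."""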
return fast_pow2(fast_log2_ceil(amax * fp8_max_inv))
@tilelang.jit(pass_configs=pass_configs)
def act_quant_kernel(
N, block_size=128, in_dtype=BF16, out_dtype=FP8, scale_dtype=FP32,
round_scale=False, inplace=False
):
"""Block-wise FP8 quantization. inplace=True does fused quant+dequant back to BF16."""
M = T.symbolic("M")
fp8_min = -448.0
fp8_max = 448.0
fp8_max_inv = 1 / fp8_max
num_stages = 0 if round_scale or inplace else 2
blk_m = 32
group_size = block_size
# Internal computation in FP32; scale_dtype controls output storage format.
compute_dtype = FP32
out_dtype = in_dtype if inplace else out_dtype
@T.prim_func
def act_quant_kernel_(
X: T.Tensor[(M, N), in_dtype],
Y: T.Tensor[(M, N), out_dtype],
S: T.Tensor[(M, T.ceildiv(N, group_size)), scale_dtype],
):
with T.Kernel(T.ceildiv(M, blk_m), T.ceildiv(N, group_size), threads=128) as (
pid_m,
pid_n,
):
x_shared = T.alloc_shared((blk_m, group_size), in_dtype)
x_local = T.alloc_fragment((blk_m, group_size), in_dtype)
amax_local = T.alloc_fragment((blk_m,), compute_dtype)
s_local = T.alloc_fragment((blk_m,), compute_dtype)
y_local = T.alloc_fragment((blk_m, group_size), out_dtype)
y_shared = T.alloc_shared((blk_m, group_size), out_dtype)
for _ in T.Pipelined(1, num_stages=num_stages):
T.copy(X[pid_m * blk_m, pid_n * group_size], x_shared)
T.copy(x_shared, x_local)
T.reduce_absmax(x_local, amax_local, dim=1)
for i in T.Parallel(blk_m):
amax_local[i] = T.max(amax_local[i], 1e-4)
if round_scale:
s_local[i] = fast_round_scale(amax_local[i], fp8_max_inv)
else:
s_local[i] = amax_local[i] * fp8_max_inv
if inplace:
for i, j in T.Parallel(blk_m, group_size):
y_local[i, j] = T.Cast(
out_dtype,
T.Cast(compute_dtype, T.Cast(out_dtype, T.clamp(
x_local[i, j] / s_local[i], fp8_min, fp8_max
))) * s_local[i],
)
else:
for i, j in T.Parallel(blk_m, group_size):
y_local[i, j] = T.clamp(
x_local[i, j] / s_local[i], fp8_min, fp8_max
)
for i in T.Parallel(blk_m):
S[pid_m * blk_m + i, pid_n] = T.Cast(scale_dtype, s_local[i])
T.copy(y_local, y_shared)
T.copy(y_shared, Y[pid_m * blk_m, pid_n * group_size])
return act_quant_kernel_
def act_quant(
x: torch.Tensor, block_size: int = 128, scale_fmt: Optional[str] = None,
scale_dtype: torch.dtype = torch.float32, inplace: bool = False,
) -> torch.Tensor | Tuple[torch.Tensor, torch.Tensor]:
"""Block-wise FP8 quantization. inplace=True does fused quant+dequant back to BF16.
When scale_fmt is set, scales are rounded to power-of-2 (MXFP)."""
N = x.size(-1)
assert N % block_size == 0
# tl_dtype = FE8M0 if scale_dtype == torch.float8_e8m0fnu else FP32
tl_dtype = FP32
z = x.contiguous()
y = torch.empty_like(z) if inplace else torch.empty_like(z, dtype=torch.float8_e4m3fn)
s = z.new_empty(*z.size()[:-1], N // block_size, dtype=scale_dtype)
kernel = act_quant_kernel(
N, block_size, scale_dtype=tl_dtype,
round_scale=scale_fmt is not None, inplace=inplace,
)
kernel(z.view(-1, N), y.view(-1, N), s.view(-1, N // block_size))
if inplace:
x.copy_(y)
return x
return y, s
@tilelang.jit(pass_configs=pass_configs)
def fp8_gemm_kernel(N, K, out_dtype=BF16, accum_dtype=FP32, scale_dtype=FP32):
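    """Block-scaled FP8 GEMM: C[M, N] = A[M, K] @ B[N, K]^T, applying per-128-block scales of A and B during accumulation."""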
assert out_dtype in [BF16, FP32]
M = T.symbolic("M")
group_size = 128
block_M = 32
block_N = 128
block_K = 128
@T.prim_func
def fp8_gemm_kernel_(
A: T.Tensor[(M, K), FP8],
B: T.Tensor[(N, K), FP8],
C: T.Tensor[(M, N), out_dtype],
scales_a: T.Tensor[(M, T.ceildiv(K, group_size)), scale_dtype],
scales_b: T.Tensor[(T.ceildiv(N, group_size), T.ceildiv(K, group_size)), scale_dtype],
):
with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (
bx,
by,
):
A_shared = T.alloc_shared((block_M, block_K), FP8)
B_shared = T.alloc_shared((block_N, block_K), FP8)
C_shared = T.alloc_shared((block_M, block_N), out_dtype)
Scale_C_shared = T.alloc_shared((block_M), FP32)
C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
C_local_accum = T.alloc_fragment((block_M, block_N), accum_dtype)
# Improve L2 Cache
T.use_swizzle(panel_size=10)
T.clear(C_local)
T.clear(C_local_accum)
K_iters = T.ceildiv(K, block_K)
for k in T.Pipelined(K_iters, num_stages=2):
T.copy(A[by * block_M, k * block_K], A_shared)
T.copy(B[bx * block_N, k * block_K], B_shared)
# Cast scales to FP32 for computation; scales_b has one value per block_N group
Scale_B = T.Cast(FP32, scales_b[bx * block_N // group_size, k])
for i in T.Parallel(block_M):
Scale_C_shared[i] = T.Cast(FP32, scales_a[by * block_M + i, k]) * Scale_B
T.gemm(A_shared, B_shared, C_local, transpose_B=True)
# Separate accumulator for scale-corrected results (2x accumulation precision)
for i, j in T.Parallel(block_M, block_N):
C_local_accum[i, j] += C_local[i, j] * Scale_C_shared[i]
T.clear(C_local)
T.copy(C_local_accum, C_shared)
T.copy(C_shared, C[by * block_M, bx * block_N])
return fp8_gemm_kernel_
def fp8_gemm(
a: torch.Tensor, a_s: torch.Tensor, b: torch.Tensor, b_s: torch.Tensor,
scale_dtype: torch.dtype = torch.float32,
) -> torch.Tensor:
"""C[M,N] = A[M,K] @ B[N,K]^T with per-128 block FP8 scaling on both A and B."""
assert a.is_contiguous() and b.is_contiguous(), "Input tensors must be contiguous"
assert a_s.is_contiguous() and b_s.is_contiguous(), (
"Scaling factor tensors must be contiguous"
)
# tl_dtype = FP32
tl_dtype = FE8M0 if scale_dtype == torch.float8_e8m0fnu else FP32
K = a.size(-1)
M = a.numel() // K
N = b.size(0)
c = a.new_empty(*a.size()[:-1], N, dtype=torch.get_default_dtype())
kernel = fp8_gemm_kernel(N, K, scale_dtype=tl_dtype)
kernel(a.view(M, K), b, c.view(M, N), a_s.view(M, -1), b_s)
return c
@tilelang.jit(pass_configs=pass_configs)
def sparse_attn_kernel(h_orig: int, d: int, scale=None):
"""Sparse multi-head attention via index gathering + online softmax (FlashAttention-style).
For each (batch, seq_pos), gathers top-k KV positions by index, computes attention
with numerically stable running max/sum, and includes a learnable attn_sink bias."""
b = T.symbolic("b")
m = T.symbolic("m")
n = T.symbolic("n")
topk = T.symbolic("topk")
if scale is None:
scale = (1.0 / d) ** 0.5
num_stages = 0
threads = 256
block = 32
num_blocks = tilelang.cdiv(topk, block)
padded_H = max(tilelang.math.next_power_of_2(h_orig), 16)
max_block_m = 16
if h_orig > max_block_m:
assert h_orig % max_block_m == 0, f"h should be a multiple of {max_block_m}"
REPLICATE_H = h_orig // max_block_m
else:
REPLICATE_H = 1
h = padded_H if REPLICATE_H == 1 else max_block_m
@T.prim_func
def sparse_attn_kernel_(
q: T.Tensor[(b, m, h_orig, d), BF16],
kv: T.Tensor[(b, n, d), BF16],
o: T.Tensor[(b, m, h_orig, d), BF16],
attn_sink: T.Tensor[(h_orig,), FP32],
topk_idxs: T.Tensor[(b, m, topk), INT32],
):
with T.Kernel(m * REPLICATE_H, b, threads=threads) as (bx, by):
q_shared = T.alloc_fragment((h, d), BF16)
kv_shared = T.alloc_shared((block, d), BF16)
# o_shared = T.alloc_shared((h, d), BF16)
acc_s_cast = T.alloc_shared((h, block), BF16)
idxs = T.alloc_fragment(block, INT32)
acc_s = T.alloc_fragment((h, block), FP32)
acc_o = T.alloc_fragment((h, d), FP32)
scores_max = T.alloc_fragment(h, FP32)
scores_max_prev = T.alloc_fragment(h, FP32)
scores_scale = T.alloc_fragment(h, FP32)
scores_sum = T.alloc_fragment(h, FP32)
sum_exp = T.alloc_fragment(h, FP32)
T.clear(acc_o)
T.clear(sum_exp)
T.fill(scores_max, -T.infinity(FP32))
s_i = bx if REPLICATE_H == 1 else (bx // REPLICATE_H)
H0 = (0 if REPLICATE_H == 1 else (bx % REPLICATE_H) * h)
H1 = H0 + h
T.copy(q[by, s_i, H0:H1, :], q_shared)
for t in T.Pipelined(num_blocks, num_stages=num_stages):
for i in T.Parallel(block):
idxs[i] = T.if_then_else(t * block + i < topk, topk_idxs[by, s_i, t * block + i], -1)
for i, j in T.Parallel(block, d):
kv_shared[i, j] = T.if_then_else(idxs[i] != -1, kv[by, idxs[i], j], 0)
for i, j in T.Parallel(h, block):
acc_s[i, j] = T.if_then_else(idxs[j] != -1, 0, -T.infinity(FP32))
T.gemm(q_shared, kv_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow)
for i, j in T.Parallel(h, block):
acc_s[i, j] *= scale
T.copy(scores_max, scores_max_prev)
T.reduce_max(acc_s, scores_max, dim=1, clear=False)
for i in T.Parallel(h):
scores_scale[i] = T.exp(scores_max_prev[i] - scores_max[i])
for i, j in T.Parallel(h, block):
acc_s[i, j] = T.exp(acc_s[i, j] - scores_max[i])
T.reduce_sum(acc_s, scores_sum, dim=1)
for i in T.Parallel(h):
sum_exp[i] = sum_exp[i] * scores_scale[i] + scores_sum[i]
T.copy(acc_s, acc_s_cast)
for i, j in T.Parallel(h, d):
acc_o[i, j] *= scores_scale[i]
T.gemm(acc_s_cast, kv_shared, acc_o, policy=T.GemmWarpPolicy.FullRow)
for i in T.Parallel(h):
sum_exp[i] += T.exp(attn_sink[i] - scores_max[i])
for i, j in T.Parallel(h, d):
acc_o[i, j] /= sum_exp[i]
o_shared = T.alloc_shared((h, d), BF16)
T.copy(acc_o, o_shared)
T.copy(o_shared, o[by, s_i, H0:H1, :])
return sparse_attn_kernel_
def sparse_attn(
q: torch.Tensor, kv: torch.Tensor, attn_sink: torch.Tensor, topk_idxs: torch.Tensor, softmax_scale: float
) -> torch.Tensor:
b, s, h, d = q.size()
# Pad heads to 16 for kernel efficiency (stripped after)
if h < 16:
q = torch.cat([q, q.new_zeros(b, s, 16 - h, d)], dim=2)
attn_sink = torch.cat([attn_sink, attn_sink.new_zeros(16 - h)])
o = torch.empty_like(q)
kernel = sparse_attn_kernel(q.size(2), d, softmax_scale)
kernel(q, kv, o, attn_sink, topk_idxs)
if h < 16:
o = o.narrow(2, 0, h).contiguous()
return o
@tilelang.jit(pass_configs=pass_configs)
def hc_split_sinkhorn_kernel(hc: int, sinkhorn_iters: int, eps: float):
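    """Split hyper-connection mixing logits into pre/post sigmoid gates and an hc x hc combination matrix, normalized with Sinkhorn iterations (alternating row/column normalization)."""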
n = T.symbolic("n")
mix_hc = (2 + hc) * hc
threads = 64
@T.prim_func
def hc_split_sinkhorn_kernel_(
mixes: T.Tensor[(n, mix_hc), FP32],
hc_scale: T.Tensor[(3,), FP32],
hc_base: T.Tensor[(mix_hc,), FP32],
pre: T.Tensor[(n, hc), FP32],
post: T.Tensor[(n, hc), FP32],
comb: T.Tensor[(n, hc, hc), FP32],
):
with T.Kernel(n, threads=threads) as i:
mixes_shared = T.alloc_shared(mix_hc, FP32)
comb_frag = T.alloc_fragment((hc, hc), FP32)
T.copy(mixes[i, :], mixes_shared)
for j in T.Parallel(hc):
pre[i, j] = T.sigmoid(mixes_shared[j] * hc_scale[0] + hc_base[j]) + eps
for j in T.Parallel(hc):
post[i, j] = 2 * T.sigmoid(mixes_shared[j + hc] * hc_scale[1] + hc_base[j + hc])
for j, k in T.Parallel(hc, hc):
comb_frag[j, k] = mixes_shared[j * hc + k + hc * 2] * hc_scale[2] + hc_base[j * hc + k + hc * 2]
row_sum = T.alloc_fragment(hc, FP32)
col_sum = T.alloc_fragment(hc, FP32)
# comb = comb.softmax(-1) + eps
row_max = T.alloc_fragment(hc, FP32)
T.reduce_max(comb_frag, row_max, dim=1)
for j, k in T.Parallel(hc, hc):
comb_frag[j, k] = T.exp(comb_frag[j, k] - row_max[j])
T.reduce_sum(comb_frag, row_sum, dim=1)
for j, k in T.Parallel(hc, hc):
comb_frag[j, k] = comb_frag[j, k] / row_sum[j] + eps
# comb = comb / (comb.sum(-2) + eps)
T.reduce_sum(comb_frag, col_sum, dim=0)
for j, k in T.Parallel(hc, hc):
comb_frag[j, k] = comb_frag[j, k] / (col_sum[k] + eps)
for _ in T.serial(sinkhorn_iters - 1):
# comb = comb / (comb.sum(-1) + eps)
T.reduce_sum(comb_frag, row_sum, dim=1)
for j, k in T.Parallel(hc, hc):
comb_frag[j, k] = comb_frag[j, k] / (row_sum[j] + eps)
# comb = comb / (comb.sum(-2) + eps)
T.reduce_sum(comb_frag, col_sum, dim=0)
for j, k in T.Parallel(hc, hc):
comb_frag[j, k] = comb_frag[j, k] / (col_sum[k] + eps)
T.copy(comb_frag, comb[i, :, :])
return hc_split_sinkhorn_kernel_
def hc_split_sinkhorn(mixes: torch.Tensor, hc_scale: torch.Tensor, hc_base: torch.Tensor, hc_mult: int = 4, sinkhorn_iters: int = 20, eps: float = 1e-6):
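    """Wrapper around hc_split_sinkhorn_kernel; returns (pre, post, comb) of shapes (b, s, hc), (b, s, hc), (b, s, hc, hc)."""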
b, s, _ = mixes.size()
pre = mixes.new_empty(b, s, hc_mult)
post = mixes.new_empty(b, s, hc_mult)
comb = mixes.new_empty(b, s, hc_mult, hc_mult)
kernel = hc_split_sinkhorn_kernel(hc_mult, sinkhorn_iters, eps)
kernel(mixes.view(-1, (2 + hc_mult) * hc_mult), hc_scale, hc_base,
pre.view(-1, hc_mult), post.view(-1, hc_mult), comb.view(-1, hc_mult, hc_mult))
return pre, post, comb
This diff is collapsed.
#!/bin/bash
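# Launch interactive chat inference on the FP8-sharded checkpoint with one torchrun process per model-parallel rank.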
export NCCL_ALGO=Ring
export NCCL_PROTO=Simple
export MP=8
export CONFIG=config.json
torchrun --nproc-per-node ${MP} generate.py --ckpt-path /path/of/DeepSeek-V4-Flash-FP8-MP8 --config ${CONFIG} --interactive
# Unique model identifier
modelCode=2397
# Model name
modelName=DeepSeek-V4
# Model description
modelDescription=DeepSeek-V4: towards efficient million-token context intelligence.
# Process type
processType=Inference
# Algorithm category
......