Commit 715e39c4 authored by chenych's avatar chenych

Add experimental version

parent 1fac49dc
......@@ -24,6 +24,11 @@ DCU型号:K100AI,节点数量:4台,卡数:32 张。
docker pull image.sourcefind.cn:5000/dcu/admin/base/vllm:0.9.2-ubuntu22.04-dtk25.04.1-rc5-rocblas104381-0915-das1.6-py3.10-20250916-rc2
docker run -it --shm-size 200g --network=host --name {docker_name} --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro {imageID} bash
## Install transformers
git clone -b add-deepseek-exp https://github.com/huggingface/transformers.git
cd transformers
pip install -e .
cd /your_code_path/deepseek-v3.2-exp_pytorch
```
......@@ -35,6 +40,11 @@ docker build --no-cache -t deepseek-v3.2-exp:latest .
docker run -it --shm-size 200g --network=host --name {docker_name} --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro {imageID} bash
## Install transformers
git clone -b add-deepseek-exp https://github.com/huggingface/transformers.git
cd transformers
pip install -e .
cd /your_code_path/deepseek-v3.2-exp_pytorch
```
......@@ -45,10 +55,17 @@ cd /your_code_path/deepseek-v3.2-exp_pytorch
DTK: 25.04.1
python: 3.10.12
torch: 2.5.1+das.opt1.dtk25041
vllm: 0.9.2+das.opt1.rc2.dtk25041
transformers: 4.55.0
```
`Tips: The DTK driver, PyTorch, and the other DCU-related components above must use exactly these matching versions.` The remaining components are installed as follows (a quick version check is sketched after the install block):
```bash
## Install transformers
git clone -b add-deepseek-exp https://github.com/huggingface/transformers.git
cd transformers
pip install -e .
cd /your_code_path/deepseek-v3.2-exp_pytorch
```
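As a quick sanity check, the minimal Python sketch below (an illustrative addition, run inside the container) prints the installed versions so they can be compared against the list above:
```python
# Print the installed versions of the DCU-related components; they should match the list above.
import torch
import transformers
import vllm

print("torch:", torch.__version__)
print("transformers:", transformers.__version__)
print("vllm:", vllm.__version__)
```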
## Dataset
......@@ -57,32 +74,113 @@ transformers: 4.55.0
None at this time.
## Inference
Example model: [DeepSeek-V3.2-Exp](https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp)
First, convert the model from fp8 to bf16:
```bash
# convert the fp8 weights to bf16
python inference/fp8_cast_bf16.py --input-fp8-hf-path /path/to/DeepSeek-V3.2-Exp --output-bf16-hf-path /path/to/DeepSeek-V3.2-Exp-bf16
```
### Inference with vLLM
#### Multi-node server
1. Set the environment variables
> Note:
> Write these environment variables into a `.sh` file on every node; after saving, source the `.sh` file on each compute node.
>
> VLLM_HOST_IP: the IP of the node's local communication interface. Prefer the IP of the IB NIC to **avoid RCCL timeout issues**.
>
> NCCL_SOCKET_IFNAME and GLOO_SOCKET_IFNAME: the interface names corresponding to that local communication IP.
>
> Look up interfaces and IPs with: ifconfig
>
> Check IB port status with: ibstat. The port must be in the Active state to be usable, and all nodes must be consistent.
<div align=center>
<img src="./doc/ip_bw.png"/>
</div>
```bash
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export VLLM_HOST_IP=x.x.x.x # this compute node's IP; prefer the IP bound to the IB interface named in SOCKET_IFNAME
export NCCL_SOCKET_IFNAME=ibxxxx
export GLOO_SOCKET_IFNAME=ibxxxx
export NCCL_IB_HCA=mlx5_0:1 # name of the IB HCA in this environment
unset NCCL_ALGO
export NCCL_MIN_NCHANNELS=16
export NCCL_MAX_NCHANNELS=16
export NCCL_NET_GDR_READ=1
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# Hygon CPU core binding (NUMA)
export VLLM_NUMA_BIND=1
export VLLM_RANK0_NUMA=0
export VLLM_RANK1_NUMA=1
export VLLM_RANK2_NUMA=2
export VLLM_RANK3_NUMA=3
export VLLM_RANK4_NUMA=4
export VLLM_RANK5_NUMA=5
export VLLM_RANK6_NUMA=6
export VLLM_RANK7_NUMA=7
# Additional environment variables required on BW clusters:
export NCCL_NET_GDR_LEVEL=7
export NCCL_SDMA_COPY_ENABLE=0
export VLLM_RPC_TIMEOUT=1800000
```
2. Start the Ray cluster (a quick cluster check is sketched after the commands below)
> x.x.x.x is the VLLM_HOST_IP of the master (head) node from step 1
```bash
python convert.py --hf-ckpt-path /path/to/DeepSeek-V3.2-Exp-bf16 --save-path /path/to/DeepSeek-V3.2-Demo --n-experts 256 --model-parallel 32
# run on the head node
ray start --head --node-ip-address=x.x.x.x --port=6379 --num-gpus=8 --num-cpus=32
# run on each worker node
ray start --address='x.x.x.x:6379' --num-gpus=8 --num-cpus=32
```
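Optionally, before starting the server, a minimal Python sketch (assuming Ray is importable in the same environment) can confirm that all nodes and cards have joined the cluster:
```python
import ray

# Attach to the cluster started with `ray start` above.
ray.init(address="auto")

# For the 4-node x 8-card setup described here, expect about 32 GPUs in total.
print("nodes:", len(ray.nodes()))
print("GPUs:", ray.cluster_resources().get("GPU", 0))
```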
3. Start the vLLM server
> On Intel CPUs, add the `--enforce-eager` flag
```bash
vllm serve deepseek-v3.2/DeepSeek-V3.2-Exp-bf16 \
--enforce-eager \
--trust-remote-code \
--distributed-executor-backend ray \
--dtype bfloat16 \
--tensor-parallel-size 32 \
--max-model-len 32768 \
--max-num-seqs 128 \
--no-enable-chunked-prefill \
--no-enable-prefix-caching \
--gpu-memory-utilization 0.85 \
--host 127.0.0.1 \
--port 8001 \
--kv-cache-dtype bfloat16
```
Once the server is running, it can be accessed as follows (a Python client sketch follows the curl example):
```bash
curl http://127.0.0.1:8001/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "deepseek-ai/DeepSeek-V3.2-Exp",
"messages": [
{
"role": "user",
"content": "Explain Machine Learning to me in a nutshell."
}
],
"temperature": 0.15,
"top_p": 1.0,
"max_tokens": 2048,
"stream": false
}'
```
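Because vLLM exposes an OpenAI-compatible API, the same endpoint can also be queried from Python; a minimal sketch, assuming the `openai` client package is installed:
```python
from openai import OpenAI

# Point the client at the vLLM server started above; vLLM does not check the API key.
client = OpenAI(base_url="http://127.0.0.1:8001/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="deepseek-v3.2/DeepSeek-V3.2-Exp-bf16",  # must match the model name served by vLLM
    messages=[{"role": "user", "content": "Explain Machine Learning to me in a nutshell."}],
    temperature=0.15,
    max_tokens=2048,
)
print(resp.choices[0].message.content)
```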
## Result
### Accuracy
DCU accuracy is consistent with GPU; inference framework: vLLM.
## Application scenarios
### Algorithm category
......@@ -93,6 +191,7 @@ torchrun --nnodes 4 --nproc-per-node 8 --node-rank $RANK --master-addr $ADDR gen
## Pretrained weights
- [DeepSeek-V3.2-Exp](https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp)
- [DeepSeek-V3.2-Exp-Base](https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp-Base)
## Source repository and issue feedback
- https://developer.sourcefind.cn/codes/modelzoo/deepseek-v3.2-exp_pytorch
doc/arch.png (image updated, 89.2 KB → 150 KB)
# DeepSeek V3.2
First, convert the Hugging Face model weights to the format required by our inference demo. Set `MP` to match your available GPU count:
```bash
cd inference
export EXPERTS=256
python convert.py --hf-ckpt-path ${HF_CKPT_PATH} --save-path ${SAVE_PATH} --n-experts ${EXPERTS} --model-parallel ${MP}
```
Launch the interactive chat interface and start exploring DeepSeek's capabilities:
```bash
export CONFIG=config_671B_v3.2.json
torchrun --nproc-per-node ${MP} generate.py --ckpt-path ${SAVE_PATH} --config ${CONFIG} --interactive
```
\ No newline at end of file
{
"vocab_size": 129280,
"dim": 7168,
"inter_dim": 18432,
"moe_inter_dim": 2048,
"n_layers": 61,
"n_dense_layers": 3,
"n_heads": 128,
"n_routed_experts": 256,
"n_shared_experts": 1,
"n_activated_experts": 8,
"n_expert_groups": 8,
"n_limited_groups": 4,
"route_scale": 2.5,
"score_func": "sigmoid",
"q_lora_rank": 1536,
"kv_lora_rank": 512,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"v_head_dim": 128,
"dtype": "bf16",
"scale_fmt": "ue8m0",
"index_n_heads": 64,
"index_head_dim": 128,
"index_topk": 2048
}
\ No newline at end of file
import os
import shutil
from argparse import ArgumentParser
from glob import glob
from tqdm import tqdm, trange
import torch
from safetensors.torch import safe_open, save_file
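# Maps Hugging Face parameter-name fragments to (target name, shard dim) pairs for the demo
# checkpoint format; dim=None means the tensor is replicated across model-parallel ranks,
# while routed-expert weights are assigned whole experts per rank instead of being sliced.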
mapping = {
"embed_tokens": ("embed", 0),
"input_layernorm": ("attn_norm", None),
"post_attention_layernorm": ("ffn_norm", None),
"q_proj": ("wq", 0),
"q_a_proj": ("wq_a", None),
"q_a_layernorm": ("q_norm", None),
"q_b_proj": ("wq_b", 0),
"kv_a_proj_with_mqa": ("wkv_a", None),
"kv_a_layernorm": ("kv_norm", None),
"kv_b_proj": ("wkv_b", 0),
"o_proj": ("wo", 1),
"gate": ("gate", None),
"gate_proj": ("w1", 0),
"down_proj": ("w2", 1),
"up_proj": ("w3", 0),
"norm": ("norm", None),
"lm_head": ("head", 0),
"scale": ("scale", None),
"wq_b": ("wq_b", None),
"wk": ("wk", None),
"k_norm": ("k_norm", None),
"weights_proj": ("weights_proj", None),
}
def main(hf_ckpt_path, save_path, n_experts, mp):
"""
Converts and saves model checkpoint files into a specified format.
Args:
hf_ckpt_path (str): Path to the directory containing the input checkpoint files.
save_path (str): Path to the directory where the converted checkpoint files will be saved.
n_experts (int): Total number of experts in the model.
mp (int): Model parallelism factor.
Returns:
None
"""
torch.set_num_threads(8)
n_local_experts = n_experts // mp
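    # Each model-parallel rank keeps a contiguous block of n_local_experts routed experts;
    # other mapped tensors are either replicated (dim=None) or sliced along their shard dim.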
state_dicts = [{} for _ in range(mp)]
for file_path in tqdm(glob(os.path.join(hf_ckpt_path, "*.safetensors"))):
with safe_open(file_path, framework="pt", device="cpu") as f:
for name in f.keys():
if "model.layers.61" in name:
continue
param: torch.Tensor = f.get_tensor(name)
if name.startswith("model."):
name = name[len("model."):]
name = name.replace("self_attn", "attn")
name = name.replace("mlp", "ffn")
name = name.replace("weight_scale_inv", "scale")
name = name.replace("e_score_correction_bias", "bias")
key = name.split(".")[-2]
assert key in mapping, f"Key {key} not found in mapping"
new_key, dim = mapping[key]
name = name.replace(key, new_key)
for i in range(mp):
new_param = param
if "experts" in name and "shared_experts" not in name:
idx = int(name.split(".")[-3])
if idx < i * n_local_experts or idx >= (i + 1) * n_local_experts:
continue
elif dim is not None:
assert param.size(dim) % mp == 0, f"Dimension {dim} must be divisible by {mp}"
shard_size = param.size(dim) // mp
new_param = param.narrow(dim, i * shard_size, shard_size).contiguous()
state_dicts[i][name] = new_param
os.makedirs(save_path, exist_ok=True)
for i in trange(mp):
save_file(state_dicts[i], os.path.join(save_path, f"model{i}-mp{mp}.safetensors"))
for file_path in glob(os.path.join(hf_ckpt_path, "*token*")):
new_file_path = os.path.join(save_path, os.path.basename(file_path))
shutil.copyfile(file_path, new_file_path)
if __name__ == "__main__":
parser = ArgumentParser()
parser.add_argument("--hf-ckpt-path", type=str, required=True)
parser.add_argument("--save-path", type=str, required=True)
parser.add_argument("--n-experts", type=int, required=True)
parser.add_argument("--model-parallel", type=int, required=True)
args = parser.parse_args()
assert args.n_experts % args.model_parallel == 0, "Number of experts must be divisible by model parallelism"
main(args.hf_ckpt_path, args.save_path, args.n_experts, args.model_parallel)
import os
import json
from argparse import ArgumentParser
from typing import List
import torch
import torch.distributed as dist
from transformers import AutoTokenizer
from safetensors.torch import load_model
from model import Transformer, ModelArgs
def sample(logits, temperature: float = 1.0):
"""
Samples a token from the logits using temperature scaling.
Args:
logits (torch.Tensor): The logits tensor for token predictions.
temperature (float, optional): Temperature for scaling logits. Defaults to 1.0.
Returns:
torch.Tensor: The sampled token.
"""
logits = logits / max(temperature, 1e-5)
probs = torch.softmax(logits, dim=-1, dtype=torch.float32)
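    # Exponential-race (Gumbel-max style) sampling: dividing the probabilities by i.i.d.
    # Exp(1) noise and taking the argmax draws a token from the categorical distribution.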
return probs.div_(torch.empty_like(probs).exponential_(1)).argmax(dim=-1)
@torch.inference_mode()
def generate(
model: Transformer,
prompt_tokens: List[List[int]],
max_new_tokens: int,
eos_id: int,
temperature: float = 1.0
) -> List[List[int]]:
"""
Generates new tokens based on the given prompt tokens using the specified model.
Args:
model (Transformer): The transformer model used for token generation.
prompt_tokens (List[List[int]]): A list of lists containing the prompt tokens for each sequence.
max_new_tokens (int): The maximum number of new tokens to generate.
eos_id (int): The end-of-sequence token ID.
temperature (float, optional): The temperature value for sampling. Defaults to 1.0.
Returns:
List[List[int]]: A list of lists containing the generated tokens for each sequence.
"""
prompt_lens = [len(t) for t in prompt_tokens]
assert max(prompt_lens) <= model.max_seq_len, f"Prompt length exceeds model maximum sequence length (max_seq_len={model.max_seq_len})"
total_len = min(model.max_seq_len, max_new_tokens + max(prompt_lens))
tokens = torch.full((len(prompt_tokens), total_len), -1, dtype=torch.long, device="cuda")
for i, t in enumerate(prompt_tokens):
tokens[i, :len(t)] = torch.tensor(t, dtype=torch.long, device="cuda")
prev_pos = 0
finished = torch.tensor([False] * len(prompt_tokens), device="cuda")
prompt_mask = tokens != -1
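    # Incremental decoding: each step feeds only tokens[:, prev_pos:cur_pos] so state for
    # earlier positions can be reused, and prompt positions are forced back to the original
    # prompt tokens via prompt_mask.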
for cur_pos in range(min(prompt_lens), total_len):
logits = model.forward(tokens[:, prev_pos:cur_pos], prev_pos)
if temperature > 0:
next_token = sample(logits, temperature)
else:
next_token = logits.argmax(dim=-1)
next_token = torch.where(prompt_mask[:, cur_pos], tokens[:, cur_pos], next_token)
tokens[:, cur_pos] = next_token
finished |= torch.logical_and(~prompt_mask[:, cur_pos], next_token == eos_id)
prev_pos = cur_pos
if finished.all():
break
completion_tokens = []
for i, toks in enumerate(tokens.tolist()):
toks = toks[prompt_lens[i]:prompt_lens[i]+max_new_tokens]
if eos_id in toks:
toks = toks[:toks.index(eos_id)]
completion_tokens.append(toks)
return completion_tokens
def main(
ckpt_path: str,
config: str,
input_file: str = "",
interactive: bool = True,
max_new_tokens: int = 100,
temperature: float = 1.0,
) -> None:
"""
Main function to load the model and perform interactive or batch text generation.
Args:
ckpt_path (str): Path to the model checkpoint directory.
config (str): Path to the model configuration file.
input_file (str, optional): Path to a file containing input prompts. Defaults to "".
interactive (bool, optional): Whether to run in interactive mode. Defaults to True.
max_new_tokens (int, optional): Maximum number of new tokens to generate. Defaults to 100.
temperature (float, optional): Temperature for sampling. Defaults to 1.0.
"""
world_size = int(os.getenv("WORLD_SIZE", "1"))
rank = int(os.getenv("RANK", "0"))
local_rank = int(os.getenv("LOCAL_RANK", "0"))
if world_size > 1:
dist.init_process_group("nccl")
global print
if rank != 0:
print = lambda *_, **__: None
torch.cuda.set_device(local_rank)
torch.set_default_dtype(torch.bfloat16)
torch.set_num_threads(8)
torch.manual_seed(33377335)
with open(config) as f:
args = ModelArgs(**json.load(f))
print(args)
with torch.device("cuda"):
model = Transformer(args)
tokenizer = AutoTokenizer.from_pretrained(ckpt_path)
print("load model")
load_model(model, os.path.join(ckpt_path, f"model{rank}-mp{world_size}.safetensors"))
print("I'm DeepSeek 👋")
if interactive:
messages = []
while True:
if world_size == 1:
prompt = input(">>> ")
elif rank == 0:
prompt = input(">>> ")
objects = [prompt]
dist.broadcast_object_list(objects, 0)
else:
objects = [None]
dist.broadcast_object_list(objects, 0)
prompt = objects[0]
if prompt == "/exit":
break
elif prompt == "/clear":
messages.clear()
continue
messages.append({"role": "user", "content": prompt})
prompt_tokens = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
completion_tokens = generate(model, [prompt_tokens], max_new_tokens, tokenizer.eos_token_id, temperature)
completion = tokenizer.decode(completion_tokens[0], skip_special_tokens=True)
print(completion)
messages.append({"role": "assistant", "content": completion})
else:
with open(input_file) as f:
prompts = f.read().split("\n\n")
assert len(prompts) <= args.max_batch_size, f"Number of prompts exceeds maximum batch size ({args.max_batch_size})"
prompt_tokens = [tokenizer.apply_chat_template([{"role": "user", "content": prompt}], add_generation_prompt=True) for prompt in prompts]
completion_tokens = generate(model, prompt_tokens, max_new_tokens, tokenizer.eos_token_id, temperature)
completions = tokenizer.batch_decode(completion_tokens, skip_special_tokens=True)
for prompt, completion in zip(prompts, completions):
print("Prompt:", prompt)
print("Completion:", completion)
print()
if world_size > 1:
dist.destroy_process_group()
if __name__ == "__main__":
"""
Command-line interface for distributed text generation.
Arguments:
--ckpt-path (str): Path to the model checkpoint directory.
--config (str): Path to the model configuration file.
--input-file (str, optional): File containing prompts for batch processing.
--interactive (bool, optional): Enable interactive mode for generating text.
--max-new-tokens (int, optional): Maximum number of new tokens to generate. Defaults to 200.
--temperature (float, optional): Temperature for sampling. Defaults to 0.6.
Raises:
AssertionError: If neither input-file nor interactive mode is specified.
"""
parser = ArgumentParser()
parser.add_argument("--ckpt-path", type=str, required=True)
parser.add_argument("--config", type=str, required=True)
parser.add_argument("--input-file", type=str, default="")
parser.add_argument("--interactive", action="store_true")
parser.add_argument("--max-new-tokens", type=int, default=200)
parser.add_argument("--temperature", type=float, default=0.6)
args = parser.parse_args()
assert args.input_file or args.interactive, "Either input-file or interactive mode must be specified"
main(args.ckpt_path, args.config, args.input_file, args.interactive, args.max_new_tokens, args.temperature)
import torch
import tilelang
import tilelang.language as T
from typing import Tuple, Optional
tilelang.set_log_level("WARNING")
pass_configs = {
tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
tilelang.PassConfigKey.TL_DISABLE_FAST_MATH: True,
}
FP8 = "float8_e4m3"
BF16 = "bfloat16"
FP32 = "float32"
def fast_log2_ceil(x):
bits_x = T.reinterpret("uint32", x)
exp_x = (bits_x >> 23) & 0xFF
man_bits = bits_x & ((1 << 23) - 1)
return T.Cast("int32", exp_x - 127 + T.if_then_else(man_bits != 0, 1, 0))
def fast_pow2(x):
bits_x = (x + 127) << 23
return T.reinterpret("float32", bits_x)
def fast_round_scale(amax, fp8_max_inv):
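    # Round amax/fp8_max up to the next power of two so the per-block scale is exactly
    # representable in a power-of-two scale format (e.g. the "ue8m0" scale_fmt of this model).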
return fast_pow2(fast_log2_ceil(amax * fp8_max_inv))
@tilelang.jit(pass_configs=pass_configs)
def act_quant_kernel(
N, in_dtype=BF16, out_dtype=FP8, scale_dtype=FP32, round_scale=False
):
M = T.symbolic("M")
fp8_min = -448.0
fp8_max = 448.0
fp8_max_inv = 1 / fp8_max
num_stages = 0 if round_scale else 2
blk_m = 32
group_size = 128
@T.prim_func
def act_quant_kernel_(
X: T.Tensor[(M, N), in_dtype],
Y: T.Tensor[(M, N), out_dtype],
S: T.Tensor[(M, T.ceildiv(N, group_size)), scale_dtype],
):
with T.Kernel(T.ceildiv(M, blk_m), T.ceildiv(N, group_size), threads=128) as (
pid_m,
pid_n,
):
x_shared = T.alloc_shared((blk_m, group_size), in_dtype)
x_local = T.alloc_fragment((blk_m, group_size), in_dtype)
amax_local = T.alloc_fragment((blk_m,), scale_dtype)
s_local = T.alloc_fragment((blk_m,), scale_dtype)
y_local = T.alloc_fragment((blk_m, group_size), out_dtype)
y_shared = T.alloc_shared((blk_m, group_size), out_dtype)
for _ in T.Pipelined(1, num_stages=num_stages):
T.copy(X[pid_m * blk_m, pid_n * group_size], x_shared)
T.copy(x_shared, x_local)
T.reduce_absmax(x_local, amax_local, dim=1)
for i in T.Parallel(blk_m):
amax_local[i] = T.max(amax_local[i], 1e-4)
if round_scale:
s_local[i] = fast_round_scale(amax_local[i], fp8_max_inv)
else:
s_local[i] = amax_local[i] * fp8_max_inv
for i, j in T.Parallel(blk_m, group_size):
y_local[i, j] = T.clamp(
x_local[i, j] / s_local[i], fp8_min, fp8_max
)
for i in T.Parallel(blk_m):
S[pid_m * blk_m + i, pid_n] = s_local[i]
T.copy(y_local, y_shared)
T.copy(y_shared, Y[pid_m * blk_m, pid_n * group_size])
return act_quant_kernel_
def act_quant(
x: torch.Tensor, block_size: int = 128, scale_fmt: Optional[str] = None
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Quantizes the input tensor `x` using block-wise quantization.
Args:
x (torch.Tensor): The input tensor to be quantized. Must be contiguous and its last dimension size must be divisible by `block_size`.
block_size (int, optional): The size of the blocks to be used for quantization. Default is 128.
scale_fmt (Optional[str], optional): The format of the scale. Default is None.
Returns:
Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
- The quantized tensor with dtype `torch.float8_e4m3fn`.
- A tensor of scaling factors with dtype `torch.float32`.
"""
assert x.is_contiguous(), "Input tensor must be contiguous"
assert x.size(-1) % block_size == 0, (
f"Last dimension size must be divisible by block_size (block_size={block_size})"
)
N = x.size(-1)
    y = torch.empty_like(x, dtype=torch.float8_e4m3fn)
    s = x.new_empty(*x.size()[:-1], N // block_size, dtype=torch.float32)
    kernel = act_quant_kernel(N, round_scale=scale_fmt is not None)
    kernel(x.view(-1, N), y.view(-1, N), s.view(-1, N // block_size))
    return y, s


# Previous Triton-based implementation, kept for reference:
# import triton
# import triton.language as tl
# from triton import Config
#
# @triton.jit
# def act_quant_kernel(x_ptr, y_ptr, s_ptr, BLOCK_SIZE: tl.constexpr):
#     pid = tl.program_id(axis=0)
#     offs = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
#     x = tl.load(x_ptr + offs).to(tl.float32)
#     s = tl.max(tl.abs(x)) / 448.
#     y = x / s
#     y = y.to(y_ptr.dtype.element_ty)
#     tl.store(y_ptr + offs, y)
#     tl.store(s_ptr + pid, s)
#
# def act_quant(x: torch.Tensor, block_size: int = 128) -> Tuple[torch.Tensor, torch.Tensor]:
#     assert x.is_contiguous()
#     assert x.size(-1) % block_size == 0
#     y = torch.empty_like(x, dtype=torch.float8_e4m3fn)
#     s = x.new_empty(*x.size()[:-1], x.size(-1) // block_size, dtype=torch.float32)
#     grid = lambda meta: (triton.cdiv(x.numel(), meta['BLOCK_SIZE']), )
#     act_quant_kernel[grid](x, y, s, BLOCK_SIZE=block_size)
#     return y, s
@tilelang.jit(pass_configs=pass_configs)
def fp8_gemm_kernel(N, K, out_dtype=BF16, accum_dtype="float32"):
assert out_dtype in [BF16, "float32"]
M = T.symbolic("M")
group_size = 128
block_M = 32
block_N = 128
block_K = 128
@T.prim_func
def fp8_gemm_kernel_(
A: T.Tensor[(M, K), FP8],
B: T.Tensor[(N, K), FP8],
C: T.Tensor[(M, N), out_dtype],
scales_a: T.Tensor[(M, T.ceildiv(K, group_size)), FP32],
scales_b: T.Tensor[(T.ceildiv(N, group_size), T.ceildiv(K, group_size)), FP32],
):
with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (
bx,
by,
):
A_shared = T.alloc_shared((block_M, block_K), FP8)
B_shared = T.alloc_shared((block_N, block_K), FP8)
C_shared = T.alloc_shared((block_M, block_N), out_dtype)
Scale_C_shared = T.alloc_shared((block_M), FP32)
C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
C_local_accum = T.alloc_fragment((block_M, block_N), accum_dtype)
# Improve L2 Cache
T.use_swizzle(panel_size=10)
T.clear(C_local)
T.clear(C_local_accum)
K_iters = T.ceildiv(K, block_K)
for k in T.Pipelined(K_iters, num_stages=4):
# Load A into shared memory
T.copy(A[by * block_M, k * block_K], A_shared)
# Load B into shared memory
T.copy(B[bx * block_N, k * block_K], B_shared)
# Load scale into shared memory
Scale_B = scales_b[bx * block_N // group_size, k]
for i in T.Parallel(block_M):
Scale_C_shared[i] = scales_a[by * block_M + i, k] * Scale_B
T.gemm(A_shared, B_shared, C_local, transpose_B=True)
# Promote to enable 2xAcc
for i, j in T.Parallel(block_M, block_N):
C_local_accum[i, j] += C_local[i, j] * Scale_C_shared[i]
T.clear(C_local)
# TMA store
T.copy(C_local_accum, C_shared)
T.copy(C_shared, C[by * block_M, bx * block_N])
return fp8_gemm_kernel_
def fp8_gemm(
a: torch.Tensor, a_s: torch.Tensor, b: torch.Tensor, b_s: torch.Tensor
) -> torch.Tensor:
"""
Perform a matrix multiplication using FP8 precision.
Args:
a (torch.Tensor): The first input matrix, must be contiguous.
a_s (torch.Tensor): The scaling factor for the first input matrix, must be contiguous.
b (torch.Tensor): The second input matrix, must be contiguous.
b_s (torch.Tensor): The scaling factor for the second input matrix, must be contiguous.
Returns:
torch.Tensor: The result of the matrix multiplication.
"""
assert a.is_contiguous() and b.is_contiguous(), "Input tensors must be contiguous"
assert a_s.is_contiguous() and b_s.is_contiguous(), (
"Scaling factor tensors must be contiguous"
    )
    K = a.size(-1)
    M = a.numel() // K
    N = b.size(0)
    c = a.new_empty(*a.size()[:-1], N, dtype=torch.get_default_dtype())
    kernel = fp8_gemm_kernel(N, K)
    kernel(a.view(M, K), b, c.view(M, N), a_s.view(M, -1), b_s)
    return c
# @triton.jit
# def weight_dequant_kernel(x_ptr, s_ptr, y_ptr, M, N, BLOCK_SIZE: tl.constexpr):
# pid_m = tl.program_id(axis=0)
# pid_n = tl.program_id(axis=1)
# n = tl.cdiv(N, BLOCK_SIZE)
# offs_m = pid_m * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
# offs_n = pid_n * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
# offs = offs_m[:, None] * N + offs_n[None, :]
# mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
# x = tl.load(x_ptr + offs, mask=mask).to(tl.float32)
# s = tl.load(s_ptr + pid_m * n + pid_n)
# y = x * s
# tl.store(y_ptr + offs, y, mask=mask)
# def weight_dequant(x: torch.Tensor, s: torch.Tensor, block_size: int = 128) -> torch.Tensor:
# assert x.is_contiguous() and s.is_contiguous()
# assert x.dim() == 2 and s.dim() == 2
# M, N = x.size()
# y = torch.empty_like(x, dtype=torch.get_default_dtype())
# grid = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE']), triton.cdiv(N, meta['BLOCK_SIZE']))
# weight_dequant_kernel[grid](x, s, y, M, N, BLOCK_SIZE=block_size)
# return y
def weight_dequant(x: torch.Tensor, s: torch.Tensor, block_size: int = 128) -> torch.Tensor:
assert x.is_contiguous() and s.is_contiguous()
assert x.dim() == 2 and s.dim() == 2
M, N = x.size()
y = torch.empty_like(x, dtype=torch.get_default_dtype())
    # target shape of s
    s_M = (M + block_size - 1) // block_size  # ceil division
    s_N = (N + block_size - 1) // block_size  # ceil division
    # check that s has the expected shape
    assert s.size(0) == s_M and s.size(1) == s_N, \
        f"expected s to have shape ({s_M}, {s_N}), got {s.size()}"
    # expand s to the same shape as x
    s_expanded = s.repeat_interleave(block_size, dim=0).repeat_interleave(block_size, dim=1)
    # crop s_expanded to match the shape of x
    s_expanded = s_expanded[:M, :N]
    # element-wise multiplication
    y = x.to(torch.float32) * s_expanded
y = y.to(torch.bfloat16)
return y
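# Earlier Triton-based fp8_gemm implementation (the TileLang-based fp8_gemm above supersedes it; kept for reference).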
fp8_gemm_configs = [
Config({'BLOCK_SIZE_M': block_m, 'BLOCK_SIZE_N': block_n, 'BLOCK_SIZE_K': 128}, num_stages=num_stages, num_warps=8)
for block_m in [16, 32, 64] for block_n in [32, 64, 128] for num_stages in [3, 4, 5, 6]
]
@triton.autotune(configs=fp8_gemm_configs, key=['N', 'K'])
@triton.jit
def fp8_gemm_kernel(a_ptr, b_ptr, c_ptr,
a_s_ptr, b_s_ptr,
M, N: tl.constexpr, K: tl.constexpr,
BLOCK_SIZE_M: tl.constexpr,
BLOCK_SIZE_N: tl.constexpr,
BLOCK_SIZE_K: tl.constexpr):
pid_m = tl.program_id(axis=0)
pid_n = tl.program_id(axis=1)
k = tl.cdiv(K, BLOCK_SIZE_K)
offs_m = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
offs_n = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
offs_k = tl.arange(0, BLOCK_SIZE_K)
a_ptrs = a_ptr + offs_m[:, None] * K + offs_k[None, :]
b_ptrs = b_ptr + offs_n[None, :] * K + offs_k[:, None]
a_s_ptrs = a_s_ptr + offs_m * k
b_s_ptrs = b_s_ptr + (offs_n // BLOCK_SIZE_K) * k
accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
for i in range(k):
a = tl.load(a_ptrs, mask=offs_k[None, :] < K - i * BLOCK_SIZE_K, other=0.0)
b = tl.load(b_ptrs, mask=offs_k[:, None] < K - i * BLOCK_SIZE_K, other=0.0)
a_s = tl.load(a_s_ptrs)
b_s = tl.load(b_s_ptrs)
accumulator += tl.dot(a, b) * a_s[:, None] * b_s[None, :]
a_ptrs += BLOCK_SIZE_K
b_ptrs += BLOCK_SIZE_K
a_s_ptrs += 1
b_s_ptrs += 1
c = accumulator.to(c_ptr.dtype.element_ty)
offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
c_ptrs = c_ptr + offs_m[:, None] * N + offs_n[None, :]
mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
tl.store(c_ptrs, c, mask=mask)
def fp8_gemm(a: torch.Tensor, a_s: torch.Tensor, b: torch.Tensor, b_s: torch.Tensor):
assert a.is_contiguous() and b.is_contiguous()
assert a_s.is_contiguous() and b_s.is_contiguous()
K = a.size(-1)
M = a.numel() // K
N = b.size(0)
c = a.new_empty(*a.size()[:-1], N, dtype=torch.get_default_dtype())
    grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']), triton.cdiv(N, META['BLOCK_SIZE_N']))
    fp8_gemm_kernel[grid](a, b, c, a_s, b_s, M, N, K)
return c
@tilelang.jit(out_idx=[4], pass_configs=pass_configs)
def fp8_index_kernel(h: int, d: int):
b = T.symbolic("b")
m = T.symbolic("m")
n = T.symbolic("n")
blk_n1 = 512
blk_n2 = 128
@T.prim_func
def fp8_index_kernel_(
q: T.Tensor[(b, m, h, d), FP8],
q_s: T.Tensor[(b, m, h), FP32],
k: T.Tensor[(b, n, d), FP8],
k_s: T.Tensor[(b, n), FP32],
o: T.Tensor[(b, m, n), FP32],
) -> None:
with T.Kernel(b, m, T.ceildiv(n, blk_n1)) as (i_b, i_m, i1_n):
q_smem = T.alloc_shared((h, d), FP8)
T.copy(q[i_b, i_m, 0, 0], q_smem)
q_s_frag = T.alloc_fragment(h, FP32)
T.copy(q_s[i_b, i_m, 0], q_s_frag)
for i2_n in T.Pipelined(blk_n1 // blk_n2, num_stages=2):
k_smem = T.alloc_shared((blk_n2, d), FP8)
T.copy(k[i_b, i1_n * blk_n1 + i2_n * blk_n2, 0], k_smem)
k_s_frag = T.alloc_fragment(blk_n2, FP32)
T.copy(k_s[i_b, i1_n * blk_n1 + i2_n * blk_n2], k_s_frag)
logits = T.alloc_fragment((blk_n2, h), FP32)
T.gemm(
k_smem,
q_smem,
logits,
transpose_A=False,
transpose_B=True,
clear_accum=True,
)
for i_h, i3_n in T.Parallel(h, blk_n2):
logits[i3_n, i_h] = T.max(logits[i3_n, i_h], 0) * q_s_frag[i_h]
logits_sum = T.alloc_fragment(blk_n2, FP32)
T.reduce_sum(logits, logits_sum, dim=1)
for i3_n in T.Parallel(blk_n2):
logits_sum[i3_n] *= k_s_frag[i3_n]
T.copy(logits_sum, o[i_b, i_m, i1_n * blk_n1 + i2_n * blk_n2])
return fp8_index_kernel_
def fp8_index(
q: torch.Tensor,
q_s: torch.Tensor,
k: torch.Tensor,
k_s: torch.Tensor,
) -> torch.Tensor:
"""
Perform index score using FP8 precision.
Args:
q (torch.Tensor): The Q tensor, must be contiguous.
q_s (torch.Tensor): The scaling factor for Q (float), must be contiguous.
k (torch.Tensor): The K tensor, must be contiguous.
k_s (torch.Tensor): The scaling factor for K (e8m0 here), must be contiguous.
fp8 q @ fp8 k -> fp32 logits
relu(fp32 logits) * q_s (weights) -> fp32 logits
fp32 logits -> fp32 logits_sum
fp32 logits_sum * k_s (e8m0) -> fp32 index_score
"""
return fp8_index_kernel(q.shape[2], q.shape[3])(q, q_s, k, k_s)
torch
transformers
safetensors
fast_hadamard_transform
tilelang==0.1.6
\ No newline at end of file
......@@ -3,7 +3,7 @@ modelCode=1761
# Model name
modelName=deepseek-v3.2-exp_pytorch
# Model description
modelDescription=DeepSeek-V3.2-Exp is an experimental release, an intermediate step toward the next-generation architecture.
# Application scenarios
appScenario=inference, dialogue Q&A, manufacturing, finance, education, broadcast media
# Framework type