Restore the initial FP8-related implementation

Remove Medusa-related files

Restore the initial FP8-related implementation
Remove Medusa-related files
98a011e9 · zhuwenwen · 80c483dd · 98a011e9 · 98a011e9 · 98a011e9
Commit 98a011e9 authored Dec 02, 2025 by zhuwenwen
17 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -266,15 +266,15 @@ set(VLLM_EXT_SRC
  "csrc/attention/attention_with_mask_kernels_opt.cu"
  "csrc/attention/attention_with_mask_kernels_opt_tc.cu"
  "csrc/opt/layernorm_kernels_opt.cu"
-  # "csrc/layernorm_quant_kernels.cu"
+  "csrc/layernorm_quant_kernels.cu"
  "csrc/sampler.cu"
  "csrc/cuda_view.cu"
  # "csrc/quantization/gptq/q_gemm.cu"
  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
-  # "csrc/quantization/fp8/common.cu"
+  "csrc/quantization/fp8/common.cu"
  "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
  "csrc/quantization/gguf/gguf_kernel.cu"
-  # "csrc/quantization/activation_kernels.cu"
+  "csrc/quantization/activation_kernels.cu"
  "csrc/cuda_utils_kernels.cu"
  "csrc/custom_all_reduce.cu"
  "csrc/torch_bindings.cpp")

--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -123,7 +123,7 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
 
    list(APPEND GPU_FLAGS
      "-DUSE_ROCM"
-      # "-DENABLE_FP8"
+      "-DENABLE_FP8"
      "-U__HIP_NO_HALF_CONVERSIONS__"
      "-U__HIP_NO_HALF_OPERATORS__"
      "-Werror=unused-variable"

--- a/csrc/layernorm_quant_kernels.cu
+++ b/csrc/layernorm_quant_kernels.cu
@@ -6,9 +6,7 @@
 */

 #include "type_convert.cuh"
-#ifndef USE_ROCM
 #include "quantization/fp8/common.cuh"
-#endif
 #include "dispatch_utils.h"
 #include "cub_helpers.h"


--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -224,15 +224,15 @@ void apply_repetition_penalties_(torch::Tensor& logits,
                                 const torch::Tensor& output_mask,
                                 const torch::Tensor& repetition_penalties);

-// void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
-//                                torch::Tensor& weight, torch::Tensor& scale,
-//                                double epsilon);
+void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
+                               torch::Tensor& weight, torch::Tensor& scale,
+                               double epsilon);

-// void fused_add_rms_norm_static_fp8_quant(torch::Tensor& out,
-//                                          torch::Tensor& input,
-//                                          torch::Tensor& residual,
-//                                          torch::Tensor& weight,
-//                                          torch::Tensor& scale, double epsilon);
+void fused_add_rms_norm_static_fp8_quant(torch::Tensor& out,
+                                         torch::Tensor& input,
+                                         torch::Tensor& residual,
+                                         torch::Tensor& weight,
+                                         torch::Tensor& scale, double epsilon);

 void rms_norm_dynamic_per_token_quant(torch::Tensor& out,
                                      torch::Tensor const& input,
@@ -248,8 +248,8 @@ void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,

 void silu_and_mul(torch::Tensor& out, torch::Tensor& input);

-// void silu_and_mul_quant(torch::Tensor& out, torch::Tensor& input,
-//                         torch::Tensor& scale);
+void silu_and_mul_quant(torch::Tensor& out, torch::Tensor& input,
+                        torch::Tensor& scale);

 #ifndef USE_ROCM
 void silu_and_mul_nvfp4_quant(torch::Tensor& out,
@@ -257,12 +257,12 @@ void silu_and_mul_nvfp4_quant(torch::Tensor& out,
                              torch::Tensor& input,
                              torch::Tensor& input_global_scale);
 #endif
-// void silu_mul_fp8_quant_deep_gemm_cuda(
-//     const at::Tensor& input,   // (E, T, 2*H)
-//     const at::Tensor& counts,  // (E)
-//     at::Tensor& y_q,           // (E, T, H) [OUT]
-//     at::Tensor& y_s,           // (E, T, H//group_size) [OUT]
-//     int64_t group_size, bool use_ue8m0, int64_t num_parallel_tokens);
+void silu_mul_fp8_quant_deep_gemm_cuda(
+    const at::Tensor& input,   // (E, T, 2*H)
+    const at::Tensor& counts,  // (E)
+    at::Tensor& y_q,           // (E, T, H) [OUT]
+    at::Tensor& y_s,           // (E, T, H//group_size) [OUT]
+    int64_t group_size, bool use_ue8m0, int64_t num_parallel_tokens);

 void mul_and_silu(torch::Tensor& out, torch::Tensor& input);

@@ -438,15 +438,15 @@ void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,

 // void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit);

-// void static_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input,
-//                              torch::Tensor const& scale);
+void static_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input,
+                             torch::Tensor const& scale);

-// void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input,
-//                               torch::Tensor& scale);
+void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input,
+                              torch::Tensor& scale);

-// void dynamic_per_token_scaled_fp8_quant(
-//     torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scale,
-//     std::optional<torch::Tensor> const& scale_ub);
+void dynamic_per_token_scaled_fp8_quant(
+    torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scale,
+    std::optional<torch::Tensor> const& scale_ub);

 void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta,
                        const torch::Tensor& A, const torch::Tensor& B,

--- a/csrc/quantization/fused_kernels/quant_conversions.cuh
+++ b/csrc/quantization/fused_kernels/quant_conversions.cuh
@@ -6,7 +6,7 @@

 #include "quantization/vectorization.cuh"
 // TODO(luka/varun):refactor common.cuh to use this file instead
-// #include "quantization/fp8/common.cuh"
+#include "quantization/fp8/common.cuh"

 namespace vllm {


--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -32,12 +32,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  #define stride_tag
 #endif

-//   ops.def(
-//       "silu_mul_fp8_quant_deep_gemm_cuda(Tensor input, Tensor counts, Tensor! "
-//       "y_q, Tensor! y_s, int group_size, "
-//       "bool use_ue8m0, int num_parallel_tokens) -> ()");
-//   ops.impl("silu_mul_fp8_quant_deep_gemm_cuda", torch::kCUDA,
-//            &silu_mul_fp8_quant_deep_gemm_cuda);
+  ops.def(
+      "silu_mul_fp8_quant_deep_gemm_cuda(Tensor input, Tensor counts, Tensor! "
+      "y_q, Tensor! y_s, int group_size, "
+      "bool use_ue8m0, int num_parallel_tokens) -> ()");
+  ops.impl("silu_mul_fp8_quant_deep_gemm_cuda", torch::kCUDA,
+           &silu_mul_fp8_quant_deep_gemm_cuda);

  ops.def("weak_ref_tensor(Tensor input) -> Tensor");
  ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor);
@@ -269,9 +269,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.def("silu_and_mul(Tensor! result, Tensor input) -> ()");
  ops.impl("silu_and_mul", torch::kCUDA, &silu_and_mul);

-//   ops.def(
-//       "silu_and_mul_quant(Tensor! result, Tensor input, Tensor scale) -> ()");
-//   ops.impl("silu_and_mul_quant", torch::kCUDA, &silu_and_mul_quant);
+  ops.def(
+      "silu_and_mul_quant(Tensor! result, Tensor input, Tensor scale) -> ()");
+  ops.impl("silu_and_mul_quant", torch::kCUDA, &silu_and_mul_quant);

 #ifndef USE_ROCM
  ops.def(
@@ -366,20 +366,20 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {

  // Layernorm-quant
  // Apply Root Mean Square (RMS) Normalization to the input tensor.
-//   ops.def(
-//       "rms_norm_static_fp8_quant(Tensor! result, Tensor input, Tensor weight, "
-//       "Tensor scale, float epsilon) -> "
-//       "()");
-//   ops.impl("rms_norm_static_fp8_quant", torch::kCUDA,
-//            &rms_norm_static_fp8_quant);
+  ops.def(
+      "rms_norm_static_fp8_quant(Tensor! result, Tensor input, Tensor weight, "
+      "Tensor scale, float epsilon) -> "
+      "()");
+  ops.impl("rms_norm_static_fp8_quant", torch::kCUDA,
+           &rms_norm_static_fp8_quant);

  // In-place fused Add and RMS Normalization.
-//   ops.def(
-//       "fused_add_rms_norm_static_fp8_quant(Tensor! result, Tensor input, "
-//       "Tensor! residual, Tensor weight, "
-//       "Tensor scale, float epsilon) -> ()");
-//   ops.impl("fused_add_rms_norm_static_fp8_quant", torch::kCUDA,
-//            &fused_add_rms_norm_static_fp8_quant);
+  ops.def(
+      "fused_add_rms_norm_static_fp8_quant(Tensor! result, Tensor input, "
+      "Tensor! residual, Tensor weight, "
+      "Tensor scale, float epsilon) -> ()");
+  ops.impl("fused_add_rms_norm_static_fp8_quant", torch::kCUDA,
+           &fused_add_rms_norm_static_fp8_quant);

  // Fused Layernorm + Quant kernels
  ops.def(
@@ -741,25 +741,25 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 //   ops.impl("gptq_shuffle", torch::kCUDA, &gptq_shuffle);

  // Compute FP8 quantized tensor for given scaling factor.
-//   ops.def(
-//       "static_scaled_fp8_quant(Tensor! result, Tensor input, Tensor scale) -> "
-//       "()");
-//   ops.impl("static_scaled_fp8_quant", torch::kCUDA, &static_scaled_fp8_quant);
+  ops.def(
+      "static_scaled_fp8_quant(Tensor! result, Tensor input, Tensor scale) -> "
+      "()");
+  ops.impl("static_scaled_fp8_quant", torch::kCUDA, &static_scaled_fp8_quant);

-//   // Compute dynamic-per-tensor FP8 quantized tensor and scaling factor.
-//   ops.def(
-//       "dynamic_scaled_fp8_quant(Tensor! result, Tensor input, Tensor! scale) "
-//       "-> "
-//       "()");
-//   ops.impl("dynamic_scaled_fp8_quant", torch::kCUDA, &dynamic_scaled_fp8_quant);
+  // Compute dynamic-per-tensor FP8 quantized tensor and scaling factor.
+  ops.def(
+      "dynamic_scaled_fp8_quant(Tensor! result, Tensor input, Tensor! scale) "
+      "-> "
+      "()");
+  ops.impl("dynamic_scaled_fp8_quant", torch::kCUDA, &dynamic_scaled_fp8_quant);

-//   // Compute dynamic-per-token FP8 quantized tensor and scaling factor.
-//   ops.def(
-//       "dynamic_per_token_scaled_fp8_quant(Tensor! result, Tensor input, "
-//       "Tensor! scale, Tensor? scale_ub) -> "
-//       "()");
-//   ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA,
-//            &dynamic_per_token_scaled_fp8_quant);
+  // Compute dynamic-per-token FP8 quantized tensor and scaling factor.
+  ops.def(
+      "dynamic_per_token_scaled_fp8_quant(Tensor! result, Tensor input, "
+      "Tensor! scale, Tensor? scale_ub) -> "
+      "()");
+  ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA,
+           &dynamic_per_token_scaled_fp8_quant);

  // Compute int8 quantized tensor for given scaling factor.
  ops.def(

--- a/examples/medusa/README.md
+++ b/examples/medusa/README.md
-# Medusa Decoding
-本文说明如何使用vllm构建和运行medusa模型
-
-## Overview
-Medusa是一种大模型并行解码算法，除了支持官方提供的Top1-proposer,我们还支持tree-style并行解码，target model和draft model均可多卡推理
-
-与其他模型不同，medusa解码需要一个base model和若干Medusa heads.
-
-Vllm medusa model的实现在[vllm/model_executor/models/medusa.py]
-
-## Support Matrix
-  * FP16
-  * BF16
-  * PAGED_KV_CACHE
-  * Tensor Parallel
-
-### convert Medusa model weights
-# medusa 模型需要转换为vllm中Medusa的模型格式
-
-```bash
-python medusa_weight_converter.py --medusa_num_heads 4 --medusa_num_layers 1 --medusa_model_path /work/model.bin --vocab_size 152064 --hidden_size 8192 --output_dir /work/medusa/vllm-medusa-qwen2-72b-head-4 --medusa_choices="[(0), (0, 0), (0, 0, 0), (0, 1), (1), (1, 0), (0, 0, 0, 0), (0, 0, 1), (0, 2), (0, 1, 0), (2), (0, 0, 2), (0, 3), (1, 0, 0), (2, 0), (0, 2, 0), (0, 4), (0, 0, 3), (3), (0, 0, 0, 1), (0, 5), (0, 0, 1, 0), (0, 0, 4)]"
-```
-此处model.bin是训练后保存的medusa head权重，如果希望采用Top1-proposer，medusa_choices可以不设置
-
-
-### Run tree-style generation server
-
-```bash
-VLLM_TREE_DECODING=1 python3 -m vllm.entrypoints.openai.api_server \
-  --served-model-name qwen_medusa \
-  --model /models/Qwen2-72B-Instruct/ -tp 4 \
-  --max-model-len 1024 --max-num-seqs 8 --gpu-memory-utilization 0.8 \
-  --speculative-model /work/medusa/vllm-medusa-qwen2-72b-head-4 \
-  --speculative-draft-tensor-parallel-size 4 \
-  --speculative-disable-by-batch-size 9 \
-  --use-v2-block-manager \
-  --spec-decoding-acceptance-method typical_acceptance_sampler \
-  --dtype float16 --trust-remote-code --port 8086\
-  --num-speculative-heads 4 --num-speculative-tokens 24
-```
-注意：
-num_speculative_tokens = len(medusa_choices) + 1
-medusa_choices个数不能太多，否则多batch下会降低推理速度
-speculative-disable-by-batch-size要大于max-num-seqs，否则当batch等于max-num-seqs时，不会走并行解码
-
-### Run Top1-proposer server
-python3 -m vllm.entrypoints.openai.api_server \
-  --served-model-name qwen_medusa \
-  --model /models/Qwen2-72B-Instruct/ -tp 4 \
-  --max-model-len 1024 --max-num-seqs 8 --gpu-memory-utilization 0.8 \
-  --speculative-model /work/medusa/vllm-medusa-qwen2-72b-head-4 \
-  --speculative-draft-tensor-parallel-size 4 \
-  --speculative-disable-by-batch-size 9 \
-  --use-v2-block-manager \
-  --spec-decoding-acceptance-method typical_acceptance_sampler \
-  --dtype float16 --trust-remote-code --port 8086\
-  --num-speculative-tokens 4
-注意:
-使用Top1-proposer时，num-speculative-tokens就是medusa head的个数
-
-# do request
-```bash
-curl http://localhost:8086/v1/completions \
-H "Content-Type: application/json" \
-d '{
-"model": "qwen_medusa",
-"prompt": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n帮我写一个C++的快速排序算法<|im_end|>\n<|im_start|>assistant\n",
-"max_tokens": 256,
-"temperature": 0.0
-}'
-```
-
-### Run tree-style benchmark
-```bash
-VLLM_TREE_DECODING=1 python /work/test/medusa_benchmark_throughput.py --model /models/Qwen2-72B-Instruct/ -tp 4 --dtype float16 --trust-remote-code --max-num-seqs 4 --speculative-model /work/medusa/vllm-medusa1-qwen2-72b-head-4 --speculative-draft-tensor-parallel-size 4 --speculative-disable-by-batch-size 9 --use-v2-block-manager --spec-decoding-acceptance-method typical_acceptance_sampler --max-model-len 1024 --dataset /work/medusa_benchmark_data.json --num-speculative-heads 4 --num-speculative-tokens 24 --gpu-memory-utilization 0.95
-```
-
-### Run Top1-proposer benchmark
-```bash
-python /work/test/medusa_benchmark_throughput.py --model /models/Qwen2-72B-Instruct/ -tp 4 --dtype float16 --trust-remote-code --max-num-seqs 4 --speculative-model /work/medusa/vllm-medusa1-qwen2-72b-head-4 --speculative-draft-tensor-parallel-size 4 --speculative-disable-by-batch-size 9 --use-v2-block-manager --spec-decoding-acceptance-method typical_acceptance_sampler --max-model-len 1024 --dataset /work/medusa_benchmark_data.json --num-speculative-tokens 4 --gpu-memory-utilization 0.95
-```
-可设置max-num-seqs对不同的batch进行性能测试
-
--- a/examples/medusa/medusa_benchmark_throughput.py
+++ b/examples/medusa/medusa_benchmark_throughput.py
--- a/examples/medusa/medusa_weight_converter.py
+++ b/examples/medusa/medusa_weight_converter.py
-import os
-import ast
-from pathlib import Path
-from typing import Iterable, List, Optional, Tuple, Union
-from addict import Dict
-import yaml
-import argparse
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.nn.parameter import Parameter
-from transformers import PretrainedConfig
-from safetensors.torch import save_model, safe_open
-
-from vllm.model_executor.layers.linear import UnquantizedLinearMethod
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig, QuantizeMethodBase)
-from vllm.model_executor.utils import set_weight_attrs
-
-DEFAULT_VOCAB_PADDING_SIZE = 64
-
-TRAINED_BLOCK_WEIGHT_NAME_TEMPLATE = 'medusa_head.{}.{}.linear.weight'
-TRAINED_MEDUSA_HEADS_NEMA_TEMPLATE = 'medusa_head.{}.1.weight'
-TRAINED_BLOCK_BIAS_NAME_TEMPLATE = 'medusa_head.{}.{}.linear.bias'
-
-VLLM_BLOCK_WEIGHT_NAME_TEMPLATE = 'blocks.{}.layers.{}.weight'
-VLLM_BLOCK_BIAS_NAME_TEMPLATE = 'blocks.{}.layers.{}.bias'
-VLLM_MEDUSA_HEADS_WEIGHT_NAME_TEMPLATE = 'lm_heads.{}.weight'
-
-
-def default_weight_loader(param: torch.Tensor,
-                          loaded_weight: torch.Tensor) -> None:
-    """Default weight loader."""
-    assert param.size() == loaded_weight.size()
-    param.data.copy_(loaded_weight)
-
-def pad_vocab_size(vocab_size: int,
-                   pad_to: int = DEFAULT_VOCAB_PADDING_SIZE) -> int:
-    """Pad the vocab size to the given value."""
-    return ((vocab_size + pad_to - 1) // pad_to) * pad_to
-
-class MedusaConfig(PretrainedConfig):
-    model_type = "medusa"
-
-    def __init__(self,
-                 hidden_size: int = 4096,
-                 vocab_size: int = 32001,
-                 num_heads: int = 5,
-                 num_hidden_layers: int = 1,
-                 max_paths: int = 64,
-                 topk: int = 10,
-                 truncated_vocab_size: Optional[int] = None,
-                 **kwargs):
-
-        self.hidden_size = hidden_size
-        self.vocab_size = vocab_size
-        self.num_heads = num_heads
-        self.num_hidden_layers = num_hidden_layers
-        self.max_paths = max_paths
-        self.topk = topk
-        self.max_seq_len = int(2**20)
-        self.truncated_vocab_size = vocab_size if truncated_vocab_size is None\
-            else truncated_vocab_size
-        if "architectures" not in kwargs:
-            kwargs["architectures"] = ["MedusaModel"]
-
-        super().__init__(**kwargs)
-
-    @property
-    def num_attention_heads(self):
-        return 0
-
-    @property
-    def num_lookahead_tokens(self):
-        return self.num_heads
-
-    @num_lookahead_tokens.setter
-    def num_lookahead_tokens(self, num_lookahead_tokens: int):
-        self.num_heads = num_lookahead_tokens
-
-class VocabParallelEmbedding(torch.nn.Module):
-    """Embedding parallelized in the vocabulary dimension.
-
-    Adapted from torch.nn.Embedding, note that we pad the vocabulary size to
-    make sure it is divisible by the number of model parallel GPUs.
-
-    In order to support various loading methods, we ensure that LoRA-added
-    embeddings are always at the end of TP-sharded tensors. In other words,
-    we shard base embeddings and LoRA embeddings separately (both padded),
-    and place them in the same tensor.
-    In this example, we will have the original vocab size = 1010,
-    added vocab size = 16 and padding to 64. Therefore, the total
-    vocab size with padding will be 1088 (because we first pad 1010 to
-    1024, add 16, and then pad to 1088).
-    Therefore, the tensor format looks like the following:
-    TP1, rank 0 (no sharding):
-                            |< --------BASE-------- >|< -BASE PADDING-- >|< -----LORA------ >|< -LORA PADDING-- >|
-    corresponding token_id: |  0  |  1  | ... | 1009 |  -1  | ... |  -1  | 1010 | ... | 1015 |  -1  | ... |  -1  |
-                     index: |  0  |  1  | ... | 1009 | 1010 | ... | 1023 | 1024 | ... | 1039 | 1040 | ... | 1087 |
-
-    TP2, rank 0:
-                            |< --------------------BASE--------------------- >|< -----LORA------ >|< -LORA PADDING- >|
-    corresponding token_id: |  0  |  1  |  2  | ... | 497  | 498 | ...  | 511 | 1000 | ... | 1015 |  -1  | ... |  -1 |
-                     index: |  0  |  1  |  2  | ... | 497  | 498 | ...  | 511 | 512  | ... | 527  |  520 | ... | 543 |
-    TP2, rank 1:
-                            |< -----------BASE----------- >|< -BASE PADDING- >|< -----------LORA PADDING----------- >|
-    corresponding token_id: | 512 | 513 | 514 | ... | 1009 | -1  | ...  | -1  |  -1  | ... |  -1  | -1  | ... |   -1 |
-                     index: |  0  |  1  |  2  | ... | 497  | 498 | ...  | 511 | 512  | ... | 519  | 520 | ... |  543 |
-
-    Args:
-        num_embeddings: vocabulary size.
-        embedding_dim: size of hidden state.
-        params_dtype: type of the parameters.
-        org_num_embeddings: original vocabulary size (without LoRA).
-        padding_size: padding size for the vocabulary.
-        quant_config: quant config for the layer
-        prefix: full name of the layer in the state dict
-    """  # noqa: E501
-
-    def __init__(self,
-                 num_embeddings: int,
-                 embedding_dim: int,
-                 params_dtype: Optional[torch.dtype] = None,
-                 org_num_embeddings: Optional[int] = None,
-                 padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
-        super().__init__()
-
-        self.num_embeddings = num_embeddings
-        self.padding_size = padding_size
-        self.org_vocab_size = org_num_embeddings or num_embeddings
-        num_added_embeddings = num_embeddings - self.org_vocab_size
-        self.org_vocab_size_padded = pad_vocab_size(self.org_vocab_size,
-                                                    self.padding_size)
-        self.num_embeddings_padded = pad_vocab_size(
-            self.org_vocab_size_padded + num_added_embeddings,
-            self.padding_size)
-        assert self.org_vocab_size_padded <= self.num_embeddings_padded
-
-        self.embedding_dim = embedding_dim
-
-        linear_method = None
-        if quant_config is not None:
-            linear_method = quant_config.get_quant_method(self, prefix=prefix)
-        if linear_method is None:
-            linear_method = UnquantizedLinearMethod()
-        self.linear_method: QuantizeMethodBase = linear_method
-
-        if params_dtype is None:
-            params_dtype = torch.get_default_dtype()
-
-        self.linear_method.create_weights(self,
-                                          self.embedding_dim,
-                                          [self.num_embeddings_padded],
-                                          self.embedding_dim,
-                                          self.num_embeddings_padded,
-                                          params_dtype=params_dtype,
-                                          weight_loader=self.weight_loader)
-
-    def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
-        assert param.data.shape == loaded_weight.shape
-        param.data.copy_(loaded_weight)
-
-    def forward(self, input_):
-        masked_input = input_
-        # Get the embeddings.
-        output = F.embedding(masked_input.long(), self.weight)
-        return output
-
-class ParallelLMHead(VocabParallelEmbedding):
-    """Parallelized LM head.
-
-    Output logits weight matrices used in the Sampler. The weight and bias
-    tensors are padded to make sure they are divisible by the number of
-    model parallel GPUs.
-
-    Args:
-        num_embeddings: vocabulary size.
-        embedding_dim: size of hidden state.
-        bias: whether to use bias.
-        params_dtype: type of the parameters.
-        org_num_embeddings: original vocabulary size (without LoRA).
-        padding_size: padding size for the vocabulary.
-    """
-
-    def __init__(self,
-                 num_embeddings: int,
-                 embedding_dim: int,
-                 bias: bool = False,
-                 params_dtype: Optional[torch.dtype] = None,
-                 org_num_embeddings: Optional[int] = None,
-                 padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
-        super().__init__(num_embeddings, embedding_dim, params_dtype,
-                         org_num_embeddings, padding_size, quant_config,
-                         prefix)
-        if bias:
-            self.bias = Parameter(
-                torch.empty(self.num_embeddings_per_partition,
-                            dtype=params_dtype))
-            set_weight_attrs(self.bias, {
-                "output_dim": 0,
-                "weight_loader": self.weight_loader,
-            })
-        else:
-            self.register_parameter("bias", None)
-
-    def forward(self, input_):
-        del input_
-        raise RuntimeError("LMHead's weights should be used in the sampler.")
-
-
-class ResidualBlock(nn.Module):
-
-    def __init__(self, hidden_size: int, num_layers: int) -> None:
-        super().__init__()
-
-        self.layers = nn.ModuleList([
-            nn.Linear(hidden_size, hidden_size)
-            for _ in range(num_layers)
-        ])
-        self.act = nn.SiLU()
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        for layer in self.layers:
-            x = x + self.act(layer(x))
-        return x
-
-class Medusa(nn.Module):
-
-    def __init__(self, config: MedusaConfig, **_) -> None:
-        super().__init__()
-        self.config = config
-        self.blocks = nn.ModuleList([
-            ResidualBlock(hidden_size=self.config.hidden_size,
-                          num_layers=self.config.num_hidden_layers)
-            for _ in range(self.config.num_heads)
-        ])
-        self.orig_vocab_size = config.vocab_size
-        self.truncated_vocab_size = config.truncated_vocab_size
-        self.unpadded_vocab_size = self.truncated_vocab_size
-
-        self.lm_heads = nn.ModuleList([
-            ParallelLMHead(
-                self.unpadded_vocab_size,
-                config.hidden_size,
-                org_num_embeddings=self.truncated_vocab_size,
-                padding_size=DEFAULT_VOCAB_PADDING_SIZE,
-            ) for _ in range(self.config.num_heads)
-        ]) 
-
-        logit_scale = getattr(config, "logit_scale", 1.0)
-
-        self.token_map = None
-
-    def forward(self, hidden_states: torch.Tensor) -> List[torch.Tensor]:
-        return [block(hidden_states) for block in self.blocks]
-
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
-        params_dict = dict(self.named_parameters())
-
-        weights_map = {}
-
-        for name, loaded_weight in weights:
-            name = name.replace("medusa_heads.", "")
-
-            if name == "token_map":
-                if self.truncated_vocab_size < self.orig_vocab_size:
-                    self.token_map = nn.Parameter(loaded_weight,
-                                                  requires_grad=False)
-            elif name in params_dict:
-                weights_map[name] = loaded_weight
-
-        for name, loaded_weight in weights_map.items():
-            if "lm_head" in name and self.token_map is not None and\
-                loaded_weight.shape[0] > self.token_map.shape[0]:
-
-                loaded_weight = loaded_weight[self.token_map]
-
-            param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
-            weight_loader(param, loaded_weight)
-
-        if self.token_map is not None:
-            self.token_map.to(device=self.lm_heads[0].weight.device)
-
-        assert (self.truncated_vocab_size
-                == self.orig_vocab_size) or (self.token_map is not None)
-
-class CustomMedusaConfig(PretrainedConfig):
-    model_type = "medusa"
-
-    def __init__(self,
-                 name_or_path: str = "S-3000/vllm-medusa-qwen1.5-7b-chat",
-                 architectures: list[str] = ["MedusaModel"],
-                 hidden_size: int = 4096,
-                 model_type: str = "medusa",
-                 num_heads: int = 5,
-                 num_hidden_layers: int = 1,
-                 transformers_version: str = "4.41.2",
-                 truncated_vocab_size: Optional[int] = None,
-                 vocab_size: int = 151936,
-                 medusa_choices:List[List[int]] = None,
-                 **kwargs):
-        super().__init__(**kwargs)
-        self._name_or_path = name_or_path
-        self.architectures = architectures
-        self.hidden_size = hidden_size
-        self.model_type = model_type
-        self.num_heads = num_heads
-        self.num_hidden_layers = num_hidden_layers
-        self.transformers_version = transformers_version
-        self.truncated_vocab_size = truncated_vocab_size
-        self.vocab_size = vocab_size
-        self.medusa_choices = medusa_choices
-
-
-def main(args):
-    medusa_head_num = args.medusa_num_heads
-    medusa_num_layers = args.medusa_num_layers
-
-    config = MedusaConfig(hidden_size=args.hidden_size, vocab_size=args.vocab_size, num_heads=medusa_head_num)
-    medusa_model = Medusa(config)
-
-    params_dict = dict(medusa_model.named_parameters())
-
-    trained_medusa_model = torch.load(args.medusa_model_path)
-
-    for i in range(medusa_head_num):
-        vllm_medusa_head_weight_name = VLLM_MEDUSA_HEADS_WEIGHT_NAME_TEMPLATE.format(i)
-        trained_medusa_head_weight_name = TRAINED_MEDUSA_HEADS_NEMA_TEMPLATE.format(i)
-
-        vllm_medusa_head_param = params_dict[vllm_medusa_head_weight_name]
-        trained_medusa_head_param = trained_medusa_model[trained_medusa_head_weight_name]
-        weight_loader = getattr(vllm_medusa_head_param, "weight_loader",
-                                    default_weight_loader)
-        weight_loader(vllm_medusa_head_param, trained_medusa_head_param)
-
-    for i in range(medusa_head_num):
-        for j in range(medusa_num_layers):
-            # load linear weight
-            vllm_medusa_block_weight_name = VLLM_BLOCK_WEIGHT_NAME_TEMPLATE.format(i, j)
-            trained_medusa_block_weight_name = TRAINED_BLOCK_WEIGHT_NAME_TEMPLATE.format(i, j)
-
-            vllm_medusa_block_param = params_dict[vllm_medusa_block_weight_name]
-            trained_medusa_block_param = trained_medusa_model[trained_medusa_block_weight_name]
-
-            weight_loader = getattr(vllm_medusa_block_param, "weight_loader",
-                                    default_weight_loader)
-            weight_loader(vllm_medusa_block_param, trained_medusa_block_param)
-
-            # load linear bias
-            vllm_medusa_block_bias_name = VLLM_BLOCK_BIAS_NAME_TEMPLATE.format(i, j)
-            trained_medusa_block_bias_name = TRAINED_BLOCK_BIAS_NAME_TEMPLATE.format(i, j)
-
-            vllm_medusa_block_bias_param = params_dict[vllm_medusa_block_bias_name]
-            trained_medusa_block_bias_param = trained_medusa_model[trained_medusa_block_bias_name]
-
-            weight_loader = getattr(vllm_medusa_block_bias_param, "weight_loader",
-                                    default_weight_loader)
-            weight_loader(vllm_medusa_block_bias_param, trained_medusa_block_bias_param)
-    
-
-    if not Path(args.output_dir).is_dir():
-        os.makedirs(args.output_dir, exist_ok=True)
-    save_model(medusa_model, os.path.join(args.output_dir, "model.safetensors"))
-    
-    medusa_choices = ast.literal_eval(args.medusa_choices) if args.medusa_choices is not None else None
-    to_save_config = CustomMedusaConfig(name_or_path=os.path.join(args.output_dir, "config.json"),
-                                        hidden_size=args.hidden_size,
-                                        num_heads=medusa_head_num,
-                                        num_hidden_layers=medusa_num_layers,
-                                        vocab_size=args.vocab_size,
-                                        medusa_choices=medusa_choices)
-    to_save_config.save_pretrained(args.output_dir)
-
-    # validate weight
-    # with safe_open(os.path.join(args.output_dir, "model.safetensors"), framework="pt") as f:
-    #     param = f.get_tensor(VLLM_BLOCK_WEIGHT_NAME_TEMPLATE.format(3, 0))
-    #     trained_param = trained_medusa_model[TRAINED_BLOCK_WEIGHT_NAME_TEMPLATE.format(3, 0)]
-    #     mse_value = torch.nn.functional.mse_loss(param.cpu(), trained_param.cpu())
-    #     print("weight mes:", mse_value)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Medusa Model Evaluator")
-    parser.add_argument("--medusa_model_path", type=str, required=True,
-                        help="Path to the medusa model file.")
-    parser.add_argument("--vocab_size", type=int, required=True,
-                        help="Vocab size")
-    parser.add_argument("--medusa_num_heads", type=int, required=True,
-                        help="Number of Medusa heads")
-    parser.add_argument("--medusa_num_layers", type=int, required=True,
-                        help="Number of Medusa layers")
-    parser.add_argument("--hidden_size", type=int, required=True,
-                        help="Hidden size")
-    parser.add_argument("--output_dir", type=str, required=True,
-                        help="Output dir")
-    parser.add_argument(
-        '--medusa_choices',
-        type=str,
-        default=None,
-        help="Medusa choice to use, if not none, will use Medusa decoding."
-        "   E.g.: [[0, 0, 0, 0], [0, 1, 0], [1, 0], [1, 1]] for 9 medusa tokens."
-    )
-    args = parser.parse_args()
-    main(args)
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -901,20 +901,20 @@ def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor,
 #         return torch.empty_like(b, memory_format=torch.contiguous_format)


-# if hasattr(torch.ops._C, "allspark_w8a16_gemm"):
-
-#     @register_fake("_C::allspark_w8a16_gemm")
-#     def _allspark_w8a16_gemm_fake(a: torch.Tensor, b_qweight: torch.Tensor,
-#                                   b_scales: torch.Tensor,
-#                                   b_qzeros: Optional[torch.Tensor],
-#                                   n: torch.SymInt, group_size: torch.SymInt,
-#                                   sm_count: torch.SymInt,
-#                                   sm_version: torch.SymInt,
-#                                   CUBLAS_M_THRESHOLD: torch.SymInt,
-#                                   has_zp: bool,
-#                                   n32k16_reorder: bool) -> torch.Tensor:
-#         m = a.size(0)
-#         return torch.empty((m, n), device=a.device, dtype=a.dtype)
+if hasattr(torch.ops._C, "allspark_w8a16_gemm"):
+
+    @register_fake("_C::allspark_w8a16_gemm")
+    def _allspark_w8a16_gemm_fake(a: torch.Tensor, b_qweight: torch.Tensor,
+                                  b_scales: torch.Tensor,
+                                  b_qzeros: Optional[torch.Tensor],
+                                  n: torch.SymInt, group_size: torch.SymInt,
+                                  sm_count: torch.SymInt,
+                                  sm_version: torch.SymInt,
+                                  CUBLAS_M_THRESHOLD: torch.SymInt,
+                                  has_zp: bool,
+                                  n32k16_reorder: bool) -> torch.Tensor:
+        m = a.size(0)
+        return torch.empty((m, n), device=a.device, dtype=a.dtype)

 if hasattr(torch.ops._C, "ggml_dequantize"):

@@ -1664,67 +1664,66 @@ def scaled_fp4_experts_quant(
    return output, output_scales


-# fp8
-# def scaled_fp8_quant(
-#     input: torch.Tensor,
-#     scale: Optional[torch.Tensor] = None,
-#     num_token_padding: Optional[int] = None,
-#     scale_ub: Optional[torch.Tensor] = None,
-#     use_per_token_if_dynamic: bool = False,
-#     output: Optional[torch.Tensor] = None,
-# ) -> tuple[torch.Tensor, torch.Tensor]:
-#     """
-#     Quantize input tensor to FP8 and return quantized tensor and scale.
-
-#     This function supports both static and dynamic quantization: If you
-#     provide the scale, it will use static scaling and if you omit it,
-#     the scale will be determined dynamically. The function also allows
-#     optional padding of the output tensors for downstream kernels that
-#     will benefit from padding.
-
-#     Args:
-#         input: The input tensor to be quantized to FP8
-#         scale: Optional scaling factor for the FP8 quantization
-#         scale_ub: Optional upper bound for scaling factor in dynamic
-#             per token case
-#         num_token_padding: If specified, pad the first dimension
-#             of the output to at least this value.
-#         use_per_token_if_dynamic: Whether to do per_tensor or per_token
-#             in the dynamic quantization case.
-
-#     Returns:
-#         tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and
-#             scaling factor.
-#     """
-#     # This code assumes batch_dim and num_tokens are flattened
-#     assert (input.ndim == 2)
-#     shape: Union[tuple[int, int], torch.Size] = input.shape
-#     # For ROCm on MI300, the output fp8 dtype is torch.float_e3m3fnuz
-#     out_dtype: torch.dtype = current_platform.fp8_dtype()
-#     if num_token_padding:
-#         shape = (max(num_token_padding, input.shape[0]), shape[1])
-#     if output is None:
-#         output = torch.empty(shape, device=input.device, dtype=out_dtype)
-#     else:
-#         assert num_token_padding is None, \
-#             "padding not supported if output passed in"
-#         assert output.dtype == out_dtype
-
-#     if scale is None:
-#         if use_per_token_if_dynamic:
-#             scale = torch.empty((shape[0], 1),
-#                                 device=input.device,
-#                                 dtype=torch.float32)
-#             torch.ops._C.dynamic_per_token_scaled_fp8_quant(
-#                 output, input, scale, scale_ub)
-#         else:
-#             scale = torch.empty(1, device=input.device, dtype=torch.float32)
-#             torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
-#     else:
-#         assert scale.numel() == 1, f"{scale.shape}"
-#         torch.ops._C.static_scaled_fp8_quant(output, input, scale)
-
-#     return output, scale
+def scaled_fp8_quant(
+    input: torch.Tensor,
+    scale: Optional[torch.Tensor] = None,
+    num_token_padding: Optional[int] = None,
+    scale_ub: Optional[torch.Tensor] = None,
+    use_per_token_if_dynamic: bool = False,
+    output: Optional[torch.Tensor] = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Quantize input tensor to FP8 and return quantized tensor and scale.
+
+    This function supports both static and dynamic quantization: If you
+    provide the scale, it will use static scaling and if you omit it,
+    the scale will be determined dynamically. The function also allows
+    optional padding of the output tensors for downstream kernels that
+    will benefit from padding.
+
+    Args:
+        input: The input tensor to be quantized to FP8
+        scale: Optional scaling factor for the FP8 quantization
+        scale_ub: Optional upper bound for scaling factor in dynamic
+            per token case
+        num_token_padding: If specified, pad the first dimension
+            of the output to at least this value.
+        use_per_token_if_dynamic: Whether to do per_tensor or per_token
+            in the dynamic quantization case.
+
+    Returns:
+        tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and
+            scaling factor.
+    """
+    # This code assumes batch_dim and num_tokens are flattened
+    assert (input.ndim == 2)
+    shape: Union[tuple[int, int], torch.Size] = input.shape
+    # For ROCm on MI300, the output fp8 dtype is torch.float8_e4m3fnuz
+    out_dtype: torch.dtype = current_platform.fp8_dtype()
+    if num_token_padding:
+        shape = (max(num_token_padding, input.shape[0]), shape[1])
+    if output is None:
+        output = torch.empty(shape, device=input.device, dtype=out_dtype)
+    else:
+        assert num_token_padding is None, \
+            "padding not supported if output passed in"
+        assert output.dtype == out_dtype
+
+    if scale is None:
+        if use_per_token_if_dynamic:
+            scale = torch.empty((shape[0], 1),
+                                device=input.device,
+                                dtype=torch.float32)
+            torch.ops._C.dynamic_per_token_scaled_fp8_quant(
+                output, input, scale, scale_ub)
+        else:
+            scale = torch.empty(1, device=input.device, dtype=torch.float32)
+            torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
+    else:
+        assert scale.numel() == 1, f"{scale.shape}"
+        torch.ops._C.static_scaled_fp8_quant(output, input, scale)
+
+    return output, scale


 # gptq allspark

--- a/vllm/compilation/activation_quant_fusion.py
+++ b/vllm/compilation/activation_quant_fusion.py
@@ -26,14 +26,14 @@ FP4_DTYPE = torch.uint8

 SILU_MUL_OP = torch.ops._C.silu_and_mul.default

-# FUSED_OPS: dict[QuantKey, OpOverload] = {
-#     kFp8StaticTensorSym: torch.ops._C.silu_and_mul_quant.default,  # noqa: E501
-# }
-# silu_and_mul_nvfp4_quant_supported = (current_platform.is_cuda() and hasattr(
-#     torch.ops._C, "silu_and_mul_nvfp4_quant"))
-# if silu_and_mul_nvfp4_quant_supported:
-#     FUSED_OPS[
-#         kNvfp4Quant] = torch.ops._C.silu_and_mul_nvfp4_quant.default  # noqa: E501
+FUSED_OPS: dict[QuantKey, OpOverload] = {
+    kFp8StaticTensorSym: torch.ops._C.silu_and_mul_quant.default,  # noqa: E501
+}
+silu_and_mul_nvfp4_quant_supported = (current_platform.is_cuda() and hasattr(
+    torch.ops._C, "silu_and_mul_nvfp4_quant"))
+if silu_and_mul_nvfp4_quant_supported:
+    FUSED_OPS[
+        kNvfp4Quant] = torch.ops._C.silu_and_mul_nvfp4_quant.default  # noqa: E501


 class ActivationQuantPattern(ABC):

--- a/vllm/compilation/fix_functionalization.py
+++ b/vllm/compilation/fix_functionalization.py
@@ -68,15 +68,15 @@ class FixFunctionalizationPass(VllmInductorPass):
            elif at_target == torch.ops._C.fused_add_rms_norm.default:
                mutated_args = {1: 'input', 2: 'residual'}
                self.defunctionalize(graph, node, mutated_args)
-            # elif at_target == torch.ops._C.fused_add_rms_norm_static_fp8_quant.default:  # noqa: E501
-            #     mutated_args = {1: 'result', 2: 'residual'}
-            #     self.defunctionalize(graph, node, mutated_args)
+            elif at_target == torch.ops._C.fused_add_rms_norm_static_fp8_quant.default:  # noqa: E501
+                mutated_args = {1: 'result', 2: 'residual'}
+                self.defunctionalize(graph, node, mutated_args)
            elif at_target == torch.ops._C.rms_norm_dynamic_per_token_quant.default:  # noqa: E501
                mutated_args = {1: 'result', 2: 'scale', 3: 'residual'}
                self.defunctionalize(graph, node, mutated_args)
            elif at_target in [
                    torch.ops._C.rms_norm.default,
-                    # torch.ops._C.rms_norm_static_fp8_quant.default,
+                    torch.ops._C.rms_norm_static_fp8_quant.default,
            ]:
                mutated_args = {1: 'result'}
                self.defunctionalize(graph, node, mutated_args)
@@ -89,12 +89,12 @@ class FixFunctionalizationPass(VllmInductorPass):
                                     node,
                                     mutated_args,
                                     args=('result', 'input'))
-            # elif at_target == torch.ops._C.silu_and_mul_quant.default:
-            #     mutated_args = {1: 'result'}
-            #     self.defunctionalize(graph,
-            #                          node,
-            #                          mutated_args,
-            #                          args=('result', 'input', 'scale'))
+            elif at_target == torch.ops._C.silu_and_mul_quant.default:
+                mutated_args = {1: 'result'}
+                self.defunctionalize(graph,
+                                     node,
+                                     mutated_args,
+                                     args=('result', 'input', 'scale'))
            # elif hasattr(
            #         torch.ops._C, "silu_and_mul_nvfp4_quant"
            # ) and at_target == torch.ops._C.silu_and_mul_nvfp4_quant.default:

--- a/vllm/compilation/fusion.py
+++ b/vllm/compilation/fusion.py
@@ -40,12 +40,12 @@ RMS_OP = torch.ops._C.rms_norm.default
 RMS_ADD_OP = torch.ops._C.fused_add_rms_norm.default

 QUANT_OPS: dict[QuantKey, OpOverload] = {
-    # kFp8StaticTensorSym:
-    # torch.ops._C.static_scaled_fp8_quant.default,  # noqa: E501
-    # kFp8DynamicTensorSym:
-    # torch.ops._C.dynamic_scaled_fp8_quant.default,  # noqa: E501
-    # kFp8DynamicTokenSym:
-    # torch.ops._C.dynamic_per_token_scaled_fp8_quant.default,  # noqa: E501
+    kFp8StaticTensorSym:
+    torch.ops._C.static_scaled_fp8_quant.default,  # noqa: E501
+    kFp8DynamicTensorSym:
+    torch.ops._C.dynamic_scaled_fp8_quant.default,  # noqa: E501
+    kFp8DynamicTokenSym:
+    torch.ops._C.dynamic_per_token_scaled_fp8_quant.default,  # noqa: E501
 }
 if current_platform.is_cuda() and hasattr(torch.ops._C, "scaled_fp4_quant"):
    QUANT_OPS[kNvfp4Quant] = torch.ops._C.scaled_fp4_quant.default
@@ -66,14 +66,14 @@ class FusedRMSQuantKey(NamedTuple):


 FUSED_OPS: dict[FusedRMSQuantKey, OpOverload] = {
-    # FusedRMSQuantKey(kFp8StaticTensorSym, False):
-    # torch.ops._C.rms_norm_static_fp8_quant.default,  # noqa: E501
-    # FusedRMSQuantKey(kFp8StaticTensorSym, True):
-    # torch.ops._C.fused_add_rms_norm_static_fp8_quant.default,  # noqa: E501
-    # FusedRMSQuantKey(kFp8DynamicTokenSym, False):
-    # torch.ops._C.rms_norm_dynamic_per_token_quant.default,  # noqa: E501
-    # FusedRMSQuantKey(kFp8DynamicTokenSym, True):
-    # torch.ops._C.rms_norm_dynamic_per_token_quant.default,  # noqa: E501
+    FusedRMSQuantKey(kFp8StaticTensorSym, False):
+    torch.ops._C.rms_norm_static_fp8_quant.default,  # noqa: E501
+    FusedRMSQuantKey(kFp8StaticTensorSym, True):
+    torch.ops._C.fused_add_rms_norm_static_fp8_quant.default,  # noqa: E501
+    FusedRMSQuantKey(kFp8DynamicTokenSym, False):
+    torch.ops._C.rms_norm_dynamic_per_token_quant.default,  # noqa: E501
+    FusedRMSQuantKey(kFp8DynamicTokenSym, True):
+    torch.ops._C.rms_norm_dynamic_per_token_quant.default,  # noqa: E501
 }


@@ -351,22 +351,22 @@ class RMSNormQuantFusionPass(VllmPatternMatcherPass):
        self.patterns: PatternMatcherPass = PatternMatcherPass(
            pass_name="rmsnorm_quant_fusion_pass")

-        # for epsilon in [1e-5, 1e-6]:
+        for epsilon in [1e-5, 1e-6]:
            # Fuse rms_norm + static fp8 quant
-            # RMSNormStaticQuantPattern(epsilon,
-            #                           FP8_DTYPE).register(self.patterns)
+            RMSNormStaticQuantPattern(epsilon,
+                                      FP8_DTYPE).register(self.patterns)

            # Fuse fused_add_rms_norm + static fp8 quant
-            # FusedAddRMSNormStaticQuantPattern(epsilon, FP8_DTYPE).register(
-            #     self.patterns)
+            FusedAddRMSNormStaticQuantPattern(epsilon, FP8_DTYPE).register(
+                self.patterns)

            # # Fuse rms_norm + dynamic per-token fp8 quant
-            # RMSNormDynamicQuantPattern(epsilon,
-            #                            FP8_DTYPE).register(self.patterns)
+            RMSNormDynamicQuantPattern(epsilon,
+                                       FP8_DTYPE).register(self.patterns)

            # # Fuse fused_add_rms_norm + dynamic per-token fp8 quant
-            # FusedAddRMSNormDynamicQuantPattern(epsilon, FP8_DTYPE).register(
-            #     self.patterns)
+            FusedAddRMSNormDynamicQuantPattern(epsilon, FP8_DTYPE).register(
+                self.patterns)

        self.dump_patterns(config, self.patterns)


--- a/vllm/compilation/sequence_parallelism.py
+++ b/vllm/compilation/sequence_parallelism.py
@@ -446,16 +446,16 @@ class SequenceParallelismPass(VllmPatternMatcherPass):

        for epsilon in [1e-5, 1e-6]:
            # RMSNorm + Static FP8 quantization patterns
-            # fp8_quant_op = torch.ops._C.static_scaled_fp8_quant.default
-            # FirstAllReduceRMSNormStaticFP8Pattern(
-            #     epsilon, self.model_dtype, self.device,
-            #     fp8_quant_op).register(self.patterns)
-            # MiddleAllReduceRMSNormStaticFP8Pattern(
-            #     epsilon, self.model_dtype, self.device,
-            #     fp8_quant_op).register(self.patterns)
-            # LastAllReduceRMSNormStaticFP8Pattern(
-            #     epsilon, self.model_dtype, self.device,
-            #     fp8_quant_op).register(self.patterns)
+            fp8_quant_op = torch.ops._C.static_scaled_fp8_quant.default
+            FirstAllReduceRMSNormStaticFP8Pattern(
+                epsilon, self.model_dtype, self.device,
+                fp8_quant_op).register(self.patterns)
+            MiddleAllReduceRMSNormStaticFP8Pattern(
+                epsilon, self.model_dtype, self.device,
+                fp8_quant_op).register(self.patterns)
+            LastAllReduceRMSNormStaticFP8Pattern(
+                epsilon, self.model_dtype, self.device,
+                fp8_quant_op).register(self.patterns)

            # Normal RMSNorm patterns
            FirstAllReduceRMSNormPattern(epsilon, self.model_dtype,

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -214,7 +214,6 @@ if TYPE_CHECKING:
    VLLM_USE_OPT_OP: bool = False
    VLLM_USE_TC_PAGED_ATTN: bool = False
    VLLM_USE_PA_PRINT_PARAM: bool = False 
-    VLLM_TREE_DECODING: bool = False
    VLLM_SPEC_DECODE_EAGER: bool = False
    VLLM_PCIE_USE_CUSTOM_ALLREDUCE: bool = False
    VLLM_CUSTOM_ALLREDUCE_SUPPORTED_WORLDSIZE_MAX: int = 16
@@ -1545,12 +1544,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
    lambda: (os.environ.get("VLLM_USE_PA_PRINT_PARAM", "False").lower() in
             ("true", "1")),
    
-    # If set, vLLM will use tree-style speculative decoding.
-    "VLLM_TREE_DECODING":
-    lambda: 
-    (os.environ.get("VLLM_TREE_DECODING", "0").strip().lower() in
-     ("1", "true")),
-    
    # If set, vLLM will disable the draft model in cudagraph mode.
    "VLLM_SPEC_DECODE_EAGER":
    lambda: bool(int(os.getenv("VLLM_SPEC_DECODE_EAGER", "0"))),

--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -140,8 +140,6 @@ class LLMEngine:

        # Don't keep the dummy data in memory
        self.reset_mm_cache()
-        
-        # self.tree_decoding = os.environ.get('VLLM_TREE_DECODING') == '1'

    @classmethod
    def from_vllm_config(

--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -52,8 +52,6 @@ class WorkerBase:
    different hardware. Also abstracts control plane communication, e.g., to
    communicate request metadata to other workers.
    """
-    # TODO
-    tree_decoding = (os.environ.get('VLLM_TREE_DECODING') == '1')

    def __init__(
        self,