Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
98a011e9
Commit
98a011e9
authored
Dec 02, 2025
by
zhuwenwen
Browse files
restore the initial fp8 related implementation
remove medusa related files
parent
80c483dd
Changes
17
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
191 additions
and
1347 deletions
+191
-1347
CMakeLists.txt
CMakeLists.txt
+3
-3
cmake/utils.cmake
cmake/utils.cmake
+1
-1
csrc/layernorm_quant_kernels.cu
csrc/layernorm_quant_kernels.cu
+0
-2
csrc/ops.h
csrc/ops.h
+23
-23
csrc/quantization/fused_kernels/quant_conversions.cuh
csrc/quantization/fused_kernels/quant_conversions.cuh
+1
-1
csrc/torch_bindings.cpp
csrc/torch_bindings.cpp
+38
-38
examples/medusa/README.md
examples/medusa/README.md
+0
-83
examples/medusa/medusa_benchmark_throughput.py
examples/medusa/medusa_benchmark_throughput.py
+0
-648
examples/medusa/medusa_weight_converter.py
examples/medusa/medusa_weight_converter.py
+0
-411
vllm/_custom_ops.py
vllm/_custom_ops.py
+74
-75
vllm/compilation/activation_quant_fusion.py
vllm/compilation/activation_quant_fusion.py
+8
-8
vllm/compilation/fix_functionalization.py
vllm/compilation/fix_functionalization.py
+10
-10
vllm/compilation/fusion.py
vllm/compilation/fusion.py
+23
-23
vllm/compilation/sequence_parallelism.py
vllm/compilation/sequence_parallelism.py
+10
-10
vllm/envs.py
vllm/envs.py
+0
-7
vllm/v1/engine/llm_engine.py
vllm/v1/engine/llm_engine.py
+0
-2
vllm/worker/worker_base.py
vllm/worker/worker_base.py
+0
-2
No files found.
CMakeLists.txt
View file @
98a011e9
...
...
@@ -266,15 +266,15 @@ set(VLLM_EXT_SRC
"csrc/attention/attention_with_mask_kernels_opt.cu"
"csrc/attention/attention_with_mask_kernels_opt_tc.cu"
"csrc/opt/layernorm_kernels_opt.cu"
#
"csrc/layernorm_quant_kernels.cu"
"csrc/layernorm_quant_kernels.cu"
"csrc/sampler.cu"
"csrc/cuda_view.cu"
# "csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
#
"csrc/quantization/fp8/common.cu"
"csrc/quantization/fp8/common.cu"
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
"csrc/quantization/gguf/gguf_kernel.cu"
#
"csrc/quantization/activation_kernels.cu"
"csrc/quantization/activation_kernels.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/custom_all_reduce.cu"
"csrc/torch_bindings.cpp"
)
...
...
cmake/utils.cmake
View file @
98a011e9
...
...
@@ -123,7 +123,7 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
list
(
APPEND GPU_FLAGS
"-DUSE_ROCM"
#
"-DENABLE_FP8"
"-DENABLE_FP8"
"-U__HIP_NO_HALF_CONVERSIONS__"
"-U__HIP_NO_HALF_OPERATORS__"
"-Werror=unused-variable"
...
...
csrc/layernorm_quant_kernels.cu
View file @
98a011e9
...
...
@@ -6,9 +6,7 @@
*/
#include "type_convert.cuh"
#ifndef USE_ROCM
#include "quantization/fp8/common.cuh"
#endif
#include "dispatch_utils.h"
#include "cub_helpers.h"
...
...
csrc/ops.h
View file @
98a011e9
...
...
@@ -224,15 +224,15 @@ void apply_repetition_penalties_(torch::Tensor& logits,
const
torch
::
Tensor
&
output_mask
,
const
torch
::
Tensor
&
repetition_penalties
);
//
void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
//
torch::Tensor& weight, torch::Tensor& scale,
//
double epsilon);
void
rms_norm_static_fp8_quant
(
torch
::
Tensor
&
out
,
torch
::
Tensor
&
input
,
torch
::
Tensor
&
weight
,
torch
::
Tensor
&
scale
,
double
epsilon
);
//
void fused_add_rms_norm_static_fp8_quant(torch::Tensor& out,
//
torch::Tensor& input,
//
torch::Tensor& residual,
//
torch::Tensor& weight,
//
torch::Tensor& scale, double epsilon);
void
fused_add_rms_norm_static_fp8_quant
(
torch
::
Tensor
&
out
,
torch
::
Tensor
&
input
,
torch
::
Tensor
&
residual
,
torch
::
Tensor
&
weight
,
torch
::
Tensor
&
scale
,
double
epsilon
);
void
rms_norm_dynamic_per_token_quant
(
torch
::
Tensor
&
out
,
torch
::
Tensor
const
&
input
,
...
...
@@ -248,8 +248,8 @@ void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
void
silu_and_mul
(
torch
::
Tensor
&
out
,
torch
::
Tensor
&
input
);
//
void silu_and_mul_quant(torch::Tensor& out, torch::Tensor& input,
//
torch::Tensor& scale);
void
silu_and_mul_quant
(
torch
::
Tensor
&
out
,
torch
::
Tensor
&
input
,
torch
::
Tensor
&
scale
);
#ifndef USE_ROCM
void
silu_and_mul_nvfp4_quant
(
torch
::
Tensor
&
out
,
...
...
@@ -257,12 +257,12 @@ void silu_and_mul_nvfp4_quant(torch::Tensor& out,
torch
::
Tensor
&
input
,
torch
::
Tensor
&
input_global_scale
);
#endif
//
void silu_mul_fp8_quant_deep_gemm_cuda(
//
const at::Tensor& input, // (E, T, 2*H)
//
const at::Tensor& counts, // (E)
//
at::Tensor& y_q, // (E, T, H) [OUT]
//
at::Tensor& y_s, // (E, T, H//group_size) [OUT]
//
int64_t group_size, bool use_ue8m0, int64_t num_parallel_tokens);
void
silu_mul_fp8_quant_deep_gemm_cuda
(
const
at
::
Tensor
&
input
,
// (E, T, 2*H)
const
at
::
Tensor
&
counts
,
// (E)
at
::
Tensor
&
y_q
,
// (E, T, H) [OUT]
at
::
Tensor
&
y_s
,
// (E, T, H//group_size) [OUT]
int64_t
group_size
,
bool
use_ue8m0
,
int64_t
num_parallel_tokens
);
void
mul_and_silu
(
torch
::
Tensor
&
out
,
torch
::
Tensor
&
input
);
...
...
@@ -438,15 +438,15 @@ void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
// void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit);
//
void static_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input,
//
torch::Tensor const& scale);
void
static_scaled_fp8_quant
(
torch
::
Tensor
&
out
,
torch
::
Tensor
const
&
input
,
torch
::
Tensor
const
&
scale
);
//
void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input,
//
torch::Tensor& scale);
void
dynamic_scaled_fp8_quant
(
torch
::
Tensor
&
out
,
torch
::
Tensor
const
&
input
,
torch
::
Tensor
&
scale
);
//
void dynamic_per_token_scaled_fp8_quant(
//
torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scale,
//
std::optional<torch::Tensor> const& scale_ub);
void
dynamic_per_token_scaled_fp8_quant
(
torch
::
Tensor
&
out
,
torch
::
Tensor
const
&
input
,
torch
::
Tensor
&
scale
,
std
::
optional
<
torch
::
Tensor
>
const
&
scale_ub
);
void
selective_scan_fwd
(
const
torch
::
Tensor
&
u
,
const
torch
::
Tensor
&
delta
,
const
torch
::
Tensor
&
A
,
const
torch
::
Tensor
&
B
,
...
...
csrc/quantization/fused_kernels/quant_conversions.cuh
View file @
98a011e9
...
...
@@ -6,7 +6,7 @@
#include "quantization/vectorization.cuh"
// TODO(luka/varun):refactor common.cuh to use this file instead
//
#include "quantization/fp8/common.cuh"
#include "quantization/fp8/common.cuh"
namespace
vllm
{
...
...
csrc/torch_bindings.cpp
View file @
98a011e9
...
...
@@ -32,12 +32,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
#define stride_tag
#endif
//
ops.def(
//
"silu_mul_fp8_quant_deep_gemm_cuda(Tensor input, Tensor counts, Tensor! "
//
"y_q, Tensor! y_s, int group_size, "
//
"bool use_ue8m0, int num_parallel_tokens) -> ()");
//
ops.impl("silu_mul_fp8_quant_deep_gemm_cuda", torch::kCUDA,
//
&silu_mul_fp8_quant_deep_gemm_cuda);
ops
.
def
(
"silu_mul_fp8_quant_deep_gemm_cuda(Tensor input, Tensor counts, Tensor! "
"y_q, Tensor! y_s, int group_size, "
"bool use_ue8m0, int num_parallel_tokens) -> ()"
);
ops
.
impl
(
"silu_mul_fp8_quant_deep_gemm_cuda"
,
torch
::
kCUDA
,
&
silu_mul_fp8_quant_deep_gemm_cuda
);
ops
.
def
(
"weak_ref_tensor(Tensor input) -> Tensor"
);
ops
.
impl
(
"weak_ref_tensor"
,
torch
::
kCUDA
,
&
weak_ref_tensor
);
...
...
@@ -269,9 +269,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops
.
def
(
"silu_and_mul(Tensor! result, Tensor input) -> ()"
);
ops
.
impl
(
"silu_and_mul"
,
torch
::
kCUDA
,
&
silu_and_mul
);
//
ops.def(
//
"silu_and_mul_quant(Tensor! result, Tensor input, Tensor scale) -> ()");
//
ops.impl("silu_and_mul_quant", torch::kCUDA, &silu_and_mul_quant);
ops
.
def
(
"silu_and_mul_quant(Tensor! result, Tensor input, Tensor scale) -> ()"
);
ops
.
impl
(
"silu_and_mul_quant"
,
torch
::
kCUDA
,
&
silu_and_mul_quant
);
#ifndef USE_ROCM
ops
.
def
(
...
...
@@ -366,20 +366,20 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// Layernorm-quant
// Apply Root Mean Square (RMS) Normalization to the input tensor.
//
ops.def(
//
"rms_norm_static_fp8_quant(Tensor! result, Tensor input, Tensor weight, "
//
"Tensor scale, float epsilon) -> "
//
"()");
//
ops.impl("rms_norm_static_fp8_quant", torch::kCUDA,
//
&rms_norm_static_fp8_quant);
ops
.
def
(
"rms_norm_static_fp8_quant(Tensor! result, Tensor input, Tensor weight, "
"Tensor scale, float epsilon) -> "
"()"
);
ops
.
impl
(
"rms_norm_static_fp8_quant"
,
torch
::
kCUDA
,
&
rms_norm_static_fp8_quant
);
// In-place fused Add and RMS Normalization.
//
ops.def(
//
"fused_add_rms_norm_static_fp8_quant(Tensor! result, Tensor input, "
//
"Tensor! residual, Tensor weight, "
//
"Tensor scale, float epsilon) -> ()");
//
ops.impl("fused_add_rms_norm_static_fp8_quant", torch::kCUDA,
//
&fused_add_rms_norm_static_fp8_quant);
ops
.
def
(
"fused_add_rms_norm_static_fp8_quant(Tensor! result, Tensor input, "
"Tensor! residual, Tensor weight, "
"Tensor scale, float epsilon) -> ()"
);
ops
.
impl
(
"fused_add_rms_norm_static_fp8_quant"
,
torch
::
kCUDA
,
&
fused_add_rms_norm_static_fp8_quant
);
// Fused Layernorm + Quant kernels
ops
.
def
(
...
...
@@ -741,25 +741,25 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// ops.impl("gptq_shuffle", torch::kCUDA, &gptq_shuffle);
// Compute FP8 quantized tensor for given scaling factor.
//
ops.def(
//
"static_scaled_fp8_quant(Tensor! result, Tensor input, Tensor scale) -> "
//
"()");
//
ops.impl("static_scaled_fp8_quant", torch::kCUDA, &static_scaled_fp8_quant);
ops
.
def
(
"static_scaled_fp8_quant(Tensor! result, Tensor input, Tensor scale) -> "
"()"
);
ops
.
impl
(
"static_scaled_fp8_quant"
,
torch
::
kCUDA
,
&
static_scaled_fp8_quant
);
//
// Compute dynamic-per-tensor FP8 quantized tensor and scaling factor.
//
ops.def(
//
"dynamic_scaled_fp8_quant(Tensor! result, Tensor input, Tensor! scale) "
//
"-> "
//
"()");
//
ops.impl("dynamic_scaled_fp8_quant", torch::kCUDA, &dynamic_scaled_fp8_quant);
// Compute dynamic-per-tensor FP8 quantized tensor and scaling factor.
ops
.
def
(
"dynamic_scaled_fp8_quant(Tensor! result, Tensor input, Tensor! scale) "
"-> "
"()"
);
ops
.
impl
(
"dynamic_scaled_fp8_quant"
,
torch
::
kCUDA
,
&
dynamic_scaled_fp8_quant
);
//
// Compute dynamic-per-token FP8 quantized tensor and scaling factor.
//
ops.def(
//
"dynamic_per_token_scaled_fp8_quant(Tensor! result, Tensor input, "
//
"Tensor! scale, Tensor? scale_ub) -> "
//
"()");
//
ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA,
//
&dynamic_per_token_scaled_fp8_quant);
// Compute dynamic-per-token FP8 quantized tensor and scaling factor.
ops
.
def
(
"dynamic_per_token_scaled_fp8_quant(Tensor! result, Tensor input, "
"Tensor! scale, Tensor? scale_ub) -> "
"()"
);
ops
.
impl
(
"dynamic_per_token_scaled_fp8_quant"
,
torch
::
kCUDA
,
&
dynamic_per_token_scaled_fp8_quant
);
// Compute int8 quantized tensor for given scaling factor.
ops
.
def
(
...
...
examples/medusa/README.md
deleted
100644 → 0
View file @
80c483dd
# Medusa Decoding
本文说明如何使用vllm构建和运行medusa模型
## Overview
Medusa是一种大模型并行解码算法,除了支持官方提供的Top1-proposer,我们还支持tree-style并行解码,target model和draft model均可多卡推理
与其他模型不同,medusa解码需要一个base model和若干Medusa heads.
Vllm medusa model的实现在[vllm/model_executor/models/medusa.py]
## Support Matrix
*
FP16
*
BF16
*
PAGED_KV_CACHE
*
Tensor Parallel
### convert Medusa model weights
训练得到的 Medusa 权重需要先转换为 vLLM 中 Medusa 模型的格式:
```
bash
python medusa_weight_converter.py
--medusa_num_heads
4
--medusa_num_layers
1
--medusa_model_path
/work/model.bin
--vocab_size
152064
--hidden_size
8192
--output_dir
/work/medusa/vllm-medusa-qwen2-72b-head-4
--medusa_choices
=
"[(0), (0, 0), (0, 0, 0), (0, 1), (1), (1, 0), (0, 0, 0, 0), (0, 0, 1), (0, 2), (0, 1, 0), (2), (0, 0, 2), (0, 3), (1, 0, 0), (2, 0), (0, 2, 0), (0, 4), (0, 0, 3), (3), (0, 0, 0, 1), (0, 5), (0, 0, 1, 0), (0, 0, 4)]"
```
此处model.bin是训练后保存的medusa head权重,如果希望采用Top1-proposer,medusa_choices可以不设置
### Run tree-style generation server
```
bash
VLLM_TREE_DECODING
=
1 python3
-m
vllm.entrypoints.openai.api_server
\
--served-model-name
qwen_medusa
\
--model
/models/Qwen2-72B-Instruct/
-tp
4
\
--max-model-len
1024
--max-num-seqs
8
--gpu-memory-utilization
0.8
\
--speculative-model
/work/medusa/vllm-medusa-qwen2-72b-head-4
\
--speculative-draft-tensor-parallel-size
4
\
--speculative-disable-by-batch-size
9
\
--use-v2-block-manager
\
--spec-decoding-acceptance-method
typical_acceptance_sampler
\
--dtype
float16
--trust-remote-code
--port
8086
\
--num-speculative-heads
4
--num-speculative-tokens
24
```
注意:
num_speculative_tokens = len(medusa_choices) + 1
medusa_choices个数不能太多,否则多batch下会降低推理速度
speculative-disable-by-batch-size要大于max-num-seqs,否则当batch等于max-num-seqs时,不会走并行解码
### Run Top1-proposer server
```bash
python3 -m vllm.entrypoints.openai.api_server \
    --served-model-name qwen_medusa \
    --model /models/Qwen2-72B-Instruct/ -tp 4 \
    --max-model-len 1024 --max-num-seqs 8 --gpu-memory-utilization 0.8 \
    --speculative-model /work/medusa/vllm-medusa-qwen2-72b-head-4 \
    --speculative-draft-tensor-parallel-size 4 \
    --speculative-disable-by-batch-size 9 \
    --use-v2-block-manager \
    --spec-decoding-acceptance-method typical_acceptance_sampler \
    --dtype float16 --trust-remote-code --port 8086 \
    --num-speculative-tokens 4
```
注意:
使用Top1-proposer时,num-speculative-tokens就是medusa head的个数
### Send a request
```
bash
curl http://localhost:8086/v1/completions
\
-H
"Content-Type: application/json"
\
-d
'{
"model": "qwen_medusa",
"prompt": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n帮我写一个C++的快速排序算法<|im_end|>\n<|im_start|>assistant\n",
"max_tokens": 256,
"temperature": 0.0
}'
```
### Run tree-style benchmark
```
bash
VLLM_TREE_DECODING
=
1 python /work/test/medusa_benchmark_throughput.py
--model
/models/Qwen2-72B-Instruct/
-tp
4
--dtype
float16
--trust-remote-code
--max-num-seqs
4
--speculative-model
/work/medusa/vllm-medusa1-qwen2-72b-head-4
--speculative-draft-tensor-parallel-size
4
--speculative-disable-by-batch-size
9
--use-v2-block-manager
--spec-decoding-acceptance-method
typical_acceptance_sampler
--max-model-len
1024
--dataset
/work/medusa_benchmark_data.json
--num-speculative-heads
4
--num-speculative-tokens
24
--gpu-memory-utilization
0.95
```
### Run Top1-proposer benchmark
```
bash
python /work/test/medusa_benchmark_throughput.py
--model
/models/Qwen2-72B-Instruct/
-tp
4
--dtype
float16
--trust-remote-code
--max-num-seqs
4
--speculative-model
/work/medusa/vllm-medusa1-qwen2-72b-head-4
--speculative-draft-tensor-parallel-size
4
--speculative-disable-by-batch-size
9
--use-v2-block-manager
--spec-decoding-acceptance-method
typical_acceptance_sampler
--max-model-len
1024
--dataset
/work/medusa_benchmark_data.json
--num-speculative-tokens
4
--gpu-memory-utilization
0.95
```
可设置max-num-seqs对不同的batch进行性能测试
examples/medusa/medusa_benchmark_throughput.py
deleted
100644 → 0
View file @
80c483dd
This diff is collapsed.
Click to expand it.
examples/medusa/medusa_weight_converter.py
deleted
100644 → 0
View file @
80c483dd
import
os
import
ast
from
pathlib
import
Path
from
typing
import
Iterable
,
List
,
Optional
,
Tuple
,
Union
from
addict
import
Dict
import
yaml
import
argparse
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
torch.nn.parameter
import
Parameter
from
transformers
import
PretrainedConfig
from
safetensors.torch
import
save_model
,
safe_open
from
vllm.model_executor.layers.linear
import
UnquantizedLinearMethod
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
,
QuantizeMethodBase
)
from
vllm.model_executor.utils
import
set_weight_attrs
# Default multiple that vocabulary sizes are rounded up to (see pad_vocab_size).
DEFAULT_VOCAB_PADDING_SIZE = 64

# Key templates for tensors in the trained Medusa checkpoint.
# Block templates are formatted with (head index, layer index);
# the head template is formatted with the head index only.
TRAINED_BLOCK_WEIGHT_NAME_TEMPLATE = 'medusa_head.{}.{}.linear.weight'
TRAINED_MEDUSA_HEADS_NEMA_TEMPLATE = 'medusa_head.{}.1.weight'
TRAINED_BLOCK_BIAS_NAME_TEMPLATE = 'medusa_head.{}.{}.linear.bias'

# Matching parameter names inside the Medusa module defined in this script.
VLLM_BLOCK_WEIGHT_NAME_TEMPLATE = 'blocks.{}.layers.{}.weight'
VLLM_BLOCK_BIAS_NAME_TEMPLATE = 'blocks.{}.layers.{}.bias'
VLLM_MEDUSA_HEADS_WEIGHT_NAME_TEMPLATE = 'lm_heads.{}.weight'
def default_weight_loader(param: torch.Tensor,
                          loaded_weight: torch.Tensor) -> None:
    """Copy ``loaded_weight`` into ``param`` in place.

    Args:
        param: Destination tensor; its ``.data`` is overwritten.
        loaded_weight: Source tensor; must have the same size as ``param``.

    Raises:
        AssertionError: If the two tensors differ in size.
    """
    source_size = loaded_weight.size()
    assert param.size() == source_size
    destination = param.data
    destination.copy_(loaded_weight)
def pad_vocab_size(vocab_size: int,
                   pad_to: int = DEFAULT_VOCAB_PADDING_SIZE) -> int:
    """Round ``vocab_size`` up to a multiple of ``pad_to``.

    Args:
        vocab_size: Vocabulary size to pad.
        pad_to: Granularity of the padding.

    Returns:
        ``vocab_size`` rounded up to the next multiple of ``pad_to``
        (unchanged when it is already a multiple).
    """
    # Adding pad_to - 1 before the floor division turns it into a ceiling.
    biased = vocab_size + pad_to - 1
    num_chunks = biased // pad_to
    return num_chunks * pad_to
class MedusaConfig(PretrainedConfig):
    """Configuration object describing a stack of Medusa heads.

    Args:
        hidden_size: Width of the hidden states fed to the heads.
        vocab_size: Vocabulary size of the model.
        num_heads: Number of Medusa heads.
        num_hidden_layers: Number of layers per head block.
        max_paths: Stored as-is on the config.
        topk: Stored as-is on the config.
        truncated_vocab_size: Optional reduced vocabulary size; falls back
            to ``vocab_size`` when not given.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``. An
            ``architectures`` entry of ``["MedusaModel"]`` is added when the
            caller did not supply one.
    """

    model_type = "medusa"

    def __init__(self,
                 hidden_size: int = 4096,
                 vocab_size: int = 32001,
                 num_heads: int = 5,
                 num_hidden_layers: int = 1,
                 max_paths: int = 64,
                 topk: int = 10,
                 truncated_vocab_size: Optional[int] = None,
                 **kwargs):
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        self.num_hidden_layers = num_hidden_layers
        self.max_paths = max_paths
        self.topk = topk
        self.max_seq_len = int(2**20)

        # Without an explicit truncation the heads cover the full vocabulary.
        if truncated_vocab_size is None:
            self.truncated_vocab_size = vocab_size
        else:
            self.truncated_vocab_size = truncated_vocab_size

        kwargs.setdefault("architectures", ["MedusaModel"])

        super().__init__(**kwargs)

    @property
    def num_attention_heads(self):
        """Always 0."""
        return 0

    @property
    def num_lookahead_tokens(self):
        """Alias of ``num_heads``."""
        return self.num_heads

    @num_lookahead_tokens.setter
    def num_lookahead_tokens(self, num_lookahead_tokens: int):
        self.num_heads = num_lookahead_tokens
class VocabParallelEmbedding(torch.nn.Module):
    """Embedding table whose vocabulary dimension is padded.

    Adapted from torch.nn.Embedding. The base vocabulary and any added
    (e.g. LoRA) embeddings are padded separately and stored in one tensor:
    the base vocabulary is padded first, the added embeddings are appended,
    and the total is padded again.

    Example: with an original vocab size of 1010, 16 added embeddings and a
    padding size of 64, the base part is padded to 1024, 16 is added (1040),
    and the result is padded to 1088 rows.

    This copy of the class creates one full padded table; no sharding across
    devices is performed in the code below.

    Args:
        num_embeddings: vocabulary size.
        embedding_dim: size of hidden state.
        params_dtype: type of the parameters (torch default dtype if None).
        org_num_embeddings: original vocabulary size (without LoRA).
        padding_size: padding size for the vocabulary.
        quant_config: quant config for the layer.
        prefix: full name of the layer in the state dict.
    """

    def __init__(self,
                 num_embeddings: int,
                 embedding_dim: int,
                 params_dtype: Optional[torch.dtype] = None,
                 org_num_embeddings: Optional[int] = None,
                 padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
                 quant_config: Optional[QuantizationConfig] = None,
                 prefix: str = ""):
        super().__init__()

        self.num_embeddings = num_embeddings
        self.padding_size = padding_size
        self.org_vocab_size = org_num_embeddings or num_embeddings

        # Rows that were added on top of the original vocabulary.
        extra_rows = num_embeddings - self.org_vocab_size
        self.org_vocab_size_padded = pad_vocab_size(self.org_vocab_size,
                                                    self.padding_size)
        self.num_embeddings_padded = pad_vocab_size(
            self.org_vocab_size_padded + extra_rows, self.padding_size)
        assert self.org_vocab_size_padded <= self.num_embeddings_padded

        self.embedding_dim = embedding_dim

        # Use the quantization method from the config when it provides one,
        # otherwise fall back to the unquantized implementation.
        chosen_method = None
        if quant_config is not None:
            chosen_method = quant_config.get_quant_method(self, prefix=prefix)
        if chosen_method is None:
            chosen_method = UnquantizedLinearMethod()
        self.linear_method: QuantizeMethodBase = chosen_method

        if params_dtype is None:
            params_dtype = torch.get_default_dtype()

        # NOTE(review): positional arguments presumably map to
        # (layer, input size per partition, output partition sizes,
        # input size, output size) - confirm against create_weights.
        padded_rows = self.num_embeddings_padded
        self.linear_method.create_weights(self,
                                          self.embedding_dim,
                                          [padded_rows],
                                          self.embedding_dim,
                                          padded_rows,
                                          params_dtype=params_dtype,
                                          weight_loader=self.weight_loader)

    def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
        """Copy ``loaded_weight`` into ``param``; shapes must match exactly."""
        assert param.data.shape == loaded_weight.shape
        param.data.copy_(loaded_weight)

    def forward(self, input_):
        """Look up the embeddings for the given token ids."""
        token_ids = input_.long()
        return F.embedding(token_ids, self.weight)
class ParallelLMHead(VocabParallelEmbedding):
    """LM head holding the output-logits weight matrix.

    Output logits weight matrices used in the Sampler. The weight (and the
    optional bias) have one row per padded vocabulary entry.

    Args:
        num_embeddings: vocabulary size.
        embedding_dim: size of hidden state.
        bias: whether to use bias.
        params_dtype: type of the parameters.
        org_num_embeddings: original vocabulary size (without LoRA).
        padding_size: padding size for the vocabulary.
        quant_config: quant config for the layer.
        prefix: full name of the layer in the state dict.
    """

    def __init__(self,
                 num_embeddings: int,
                 embedding_dim: int,
                 bias: bool = False,
                 params_dtype: Optional[torch.dtype] = None,
                 org_num_embeddings: Optional[int] = None,
                 padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
                 quant_config: Optional[QuantizationConfig] = None,
                 prefix: str = ""):
        super().__init__(num_embeddings, embedding_dim, params_dtype,
                         org_num_embeddings, padding_size, quant_config,
                         prefix)
        if bias:
            # The base class in this file never defines
            # num_embeddings_per_partition, so the bias is sized by the
            # padded vocabulary, the same row count used for the weight.
            self.bias = Parameter(
                torch.empty(self.num_embeddings_padded, dtype=params_dtype))
            set_weight_attrs(self.bias, {
                "output_dim": 0,
                "weight_loader": self.weight_loader,
            })
        else:
            self.register_parameter("bias", None)

    def forward(self, input_):
        """Always raises: the head's weights are meant for the sampler."""
        del input_
        raise RuntimeError("LMHead's weights should be used in the sampler.")
class ResidualBlock(nn.Module):
    """Stack of square linear layers, each applied as ``x + SiLU(linear(x))``.

    Args:
        hidden_size: Input and output width of every linear layer.
        num_layers: Number of residual layers in the stack.
    """

    def __init__(self, hidden_size: int, num_layers: int) -> None:
        super().__init__()

        stack = nn.ModuleList()
        for _ in range(num_layers):
            stack.append(nn.Linear(hidden_size, hidden_size))
        self.layers = stack
        self.act = nn.SiLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``x`` through every residual layer in order."""
        hidden = x
        for linear in self.layers:
            hidden = hidden + self.act(linear(hidden))
        return hidden
class Medusa(nn.Module):
    """Container for Medusa heads: one residual block and one LM head each.

    Args:
        config: Supplies ``hidden_size``, ``num_hidden_layers``,
            ``num_heads``, ``vocab_size`` and ``truncated_vocab_size``.
        **_: Ignored extra keyword arguments.
    """

    def __init__(self, config: MedusaConfig, **_) -> None:
        super().__init__()
        self.config = config

        head_count = self.config.num_heads
        self.blocks = nn.ModuleList([
            ResidualBlock(hidden_size=self.config.hidden_size,
                          num_layers=self.config.num_hidden_layers)
            for _ in range(head_count)
        ])

        self.orig_vocab_size = config.vocab_size
        self.truncated_vocab_size = config.truncated_vocab_size
        self.unpadded_vocab_size = self.truncated_vocab_size

        self.lm_heads = nn.ModuleList([
            ParallelLMHead(
                self.unpadded_vocab_size,
                config.hidden_size,
                org_num_embeddings=self.truncated_vocab_size,
                padding_size=DEFAULT_VOCAB_PADDING_SIZE,
            ) for _ in range(head_count)
        ])

        # Set by load_weights when the checkpoint carries a "token_map" and
        # the vocabulary is truncated.
        self.token_map = None

    def forward(self, hidden_states: torch.Tensor) -> List[torch.Tensor]:
        """Return the output of every residual block (LM heads not applied)."""
        return [block(hidden_states) for block in self.blocks]

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        """Load ``(name, tensor)`` pairs into this module's parameters.

        The ``"medusa_heads."`` substring is stripped from names. A
        ``"token_map"`` entry is kept only when the vocabulary is truncated.
        Entries whose name is not a parameter of this module are ignored.

        Raises:
            AssertionError: If the vocabulary is truncated but no token map
                was loaded.
        """
        params_dict = dict(self.named_parameters())

        # First pass: collect matching weights and pick up the token map, so
        # the map is available no matter where it sits in the input.
        weights_map = {}
        for name, loaded_weight in weights:
            name = name.replace("medusa_heads.", "")

            if name == "token_map":
                if self.truncated_vocab_size < self.orig_vocab_size:
                    self.token_map = nn.Parameter(loaded_weight,
                                                  requires_grad=False)
            elif name in params_dict:
                weights_map[name] = loaded_weight

        # Second pass: copy, selecting the token_map rows of LM-head weights
        # that still have more rows than the map.
        for name, loaded_weight in weights_map.items():
            needs_truncation = (
                "lm_head" in name and self.token_map is not None
                and loaded_weight.shape[0] > self.token_map.shape[0])
            if needs_truncation:
                loaded_weight = loaded_weight[self.token_map]

            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)

        if self.token_map is not None:
            # Tensor.to() returns a new tensor instead of moving in place,
            # so the result has to be stored back on the parameter.
            target_device = self.lm_heads[0].weight.device
            self.token_map.data = self.token_map.data.to(device=target_device)

        assert (self.truncated_vocab_size
                == self.orig_vocab_size) or (self.token_map is not None)
class CustomMedusaConfig(PretrainedConfig):
    """Configuration that ``main`` writes next to the converted weights.

    Every argument is stored as an attribute of the same name, except
    ``name_or_path`` which is stored as ``_name_or_path``. Remaining keyword
    arguments are forwarded to ``PretrainedConfig.__init__``.

    Args:
        name_or_path: Stored as ``_name_or_path``.
        architectures: Architecture names; copied so instances never share
            the default list object.
        hidden_size: Hidden state width.
        model_type: Model type string.
        num_heads: Number of Medusa heads.
        num_hidden_layers: Number of layers per head block.
        transformers_version: Version string recorded in the config.
        truncated_vocab_size: Optional truncated vocabulary size.
        vocab_size: Vocabulary size.
        medusa_choices: Optional Medusa choices; ``None`` when not provided.
    """

    model_type = "medusa"

    def __init__(self,
                 name_or_path: str = "S-3000/vllm-medusa-qwen1.5-7b-chat",
                 architectures: list[str] = ["MedusaModel"],
                 hidden_size: int = 4096,
                 model_type: str = "medusa",
                 num_heads: int = 5,
                 num_hidden_layers: int = 1,
                 transformers_version: str = "4.41.2",
                 truncated_vocab_size: Optional[int] = None,
                 vocab_size: int = 151936,
                 medusa_choices: Optional[List[List[int]]] = None,
                 **kwargs):
        super().__init__(**kwargs)
        self._name_or_path = name_or_path
        # The default is a single list object shared by every call. Copy it
        # so mutating one config's architectures cannot leak into others.
        self.architectures = (list(architectures)
                              if architectures is not None else None)
        self.hidden_size = hidden_size
        self.model_type = model_type
        self.num_heads = num_heads
        self.num_hidden_layers = num_hidden_layers
        self.transformers_version = transformers_version
        self.truncated_vocab_size = truncated_vocab_size
        self.vocab_size = vocab_size
        self.medusa_choices = medusa_choices
def _copy_trained_weight(params_dict, trained_state, vllm_name, trained_name):
    """Copy one tensor of the trained checkpoint into the matching parameter."""
    param = params_dict[vllm_name]
    weight_loader = getattr(param, "weight_loader", default_weight_loader)
    weight_loader(param, trained_state[trained_name])


def main(args):
    """Convert a trained Medusa head checkpoint into this script's layout.

    Builds a ``Medusa`` module from the command-line sizes, copies the LM-head
    weights and the per-layer block weights and biases out of the checkpoint
    at ``args.medusa_model_path``, then writes ``model.safetensors`` and the
    config into ``args.output_dir``.

    Args:
        args: Parsed arguments with ``medusa_num_heads``,
            ``medusa_num_layers``, ``hidden_size``, ``vocab_size``,
            ``medusa_model_path``, ``output_dir`` and ``medusa_choices``.
    """
    medusa_head_num = args.medusa_num_heads
    medusa_num_layers = args.medusa_num_layers

    # num_hidden_layers must follow --medusa_num_layers. Otherwise the model
    # is built with one layer per block and the loop below fails with a
    # KeyError for any layer index >= 1.
    config = MedusaConfig(hidden_size=args.hidden_size,
                          vocab_size=args.vocab_size,
                          num_heads=medusa_head_num,
                          num_hidden_layers=medusa_num_layers)
    medusa_model = Medusa(config)
    params_dict = dict(medusa_model.named_parameters())

    # Load onto the CPU so the conversion does not need the device the
    # checkpoint was saved from.
    trained_medusa_model = torch.load(args.medusa_model_path,
                                      map_location="cpu")

    # LM-head weights, one per Medusa head.
    for i in range(medusa_head_num):
        _copy_trained_weight(
            params_dict, trained_medusa_model,
            VLLM_MEDUSA_HEADS_WEIGHT_NAME_TEMPLATE.format(i),
            TRAINED_MEDUSA_HEADS_NEMA_TEMPLATE.format(i))

    # Residual-block linear weights and biases for every (head, layer).
    for i in range(medusa_head_num):
        for j in range(medusa_num_layers):
            # load linear weight
            _copy_trained_weight(
                params_dict, trained_medusa_model,
                VLLM_BLOCK_WEIGHT_NAME_TEMPLATE.format(i, j),
                TRAINED_BLOCK_WEIGHT_NAME_TEMPLATE.format(i, j))

            # load linear bias
            _copy_trained_weight(
                params_dict, trained_medusa_model,
                VLLM_BLOCK_BIAS_NAME_TEMPLATE.format(i, j),
                TRAINED_BLOCK_BIAS_NAME_TEMPLATE.format(i, j))

    # exist_ok=True already covers the "directory exists" case.
    os.makedirs(args.output_dir, exist_ok=True)

    save_model(medusa_model, os.path.join(args.output_dir,
                                          "model.safetensors"))

    # --medusa_choices arrives as a Python literal string; parse it safely.
    medusa_choices = None
    if args.medusa_choices is not None:
        medusa_choices = ast.literal_eval(args.medusa_choices)

    to_save_config = CustomMedusaConfig(
        name_or_path=os.path.join(args.output_dir, "config.json"),
        hidden_size=args.hidden_size,
        num_heads=medusa_head_num,
        num_hidden_layers=medusa_num_layers,
        vocab_size=args.vocab_size,
        medusa_choices=medusa_choices)
    to_save_config.save_pretrained(args.output_dir)

    # validate weight
    # with safe_open(os.path.join(args.output_dir, "model.safetensors"), framework="pt") as f:
    #     param = f.get_tensor(VLLM_BLOCK_WEIGHT_NAME_TEMPLATE.format(3, 0))
    #     trained_param = trained_medusa_model[TRAINED_BLOCK_WEIGHT_NAME_TEMPLATE.format(3, 0)]
    #     mse_value = torch.nn.functional.mse_loss(param.cpu(), trained_param.cpu())
    #     print("weight mse:", mse_value)
if __name__ == "__main__":
    # Command-line entry point: parse the conversion options and run main().
    parser = argparse.ArgumentParser(description="Medusa Model Evaluator")

    # (flag, value type, help text) for every mandatory option.
    required_options = (
        ("--medusa_model_path", str, "Path to the medusa model file."),
        ("--vocab_size", int, "Vocab size"),
        ("--medusa_num_heads", int, "Number of Medusa heads"),
        ("--medusa_num_layers", int, "Number of Medusa layers"),
        ("--hidden_size", int, "Hidden size"),
        ("--output_dir", str, "Output dir"),
    )
    for flag, value_type, help_text in required_options:
        parser.add_argument(flag,
                            type=value_type,
                            required=True,
                            help=help_text)

    # Optional: when omitted, main() stores medusa_choices=None in the config.
    parser.add_argument(
        '--medusa_choices',
        type=str,
        default=None,
        help="Medusa choice to use, if not none, will use Medusa decoding."
        " E.g.: [[0, 0, 0, 0], [0, 1, 0], [1, 0], [1, 1]] for 9 medusa tokens."
    )

    args = parser.parse_args()
    main(args)
vllm/_custom_ops.py
View file @
98a011e9
...
...
@@ -901,20 +901,20 @@ def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor,
# return torch.empty_like(b, memory_format=torch.contiguous_format)
#
if hasattr(torch.ops._C, "allspark_w8a16_gemm"):
#
@register_fake("_C::allspark_w8a16_gemm")
#
def _allspark_w8a16_gemm_fake(a: torch.Tensor, b_qweight: torch.Tensor,
#
b_scales: torch.Tensor,
#
b_qzeros: Optional[torch.Tensor],
#
n: torch.SymInt, group_size: torch.SymInt,
#
sm_count: torch.SymInt,
#
sm_version: torch.SymInt,
#
CUBLAS_M_THRESHOLD: torch.SymInt,
#
has_zp: bool,
#
n32k16_reorder: bool) -> torch.Tensor:
#
m = a.size(0)
#
return torch.empty((m, n), device=a.device, dtype=a.dtype)
if
hasattr
(
torch
.
ops
.
_C
,
"allspark_w8a16_gemm"
):
@
register_fake
(
"_C::allspark_w8a16_gemm"
)
def
_allspark_w8a16_gemm_fake
(
a
:
torch
.
Tensor
,
b_qweight
:
torch
.
Tensor
,
b_scales
:
torch
.
Tensor
,
b_qzeros
:
Optional
[
torch
.
Tensor
],
n
:
torch
.
SymInt
,
group_size
:
torch
.
SymInt
,
sm_count
:
torch
.
SymInt
,
sm_version
:
torch
.
SymInt
,
CUBLAS_M_THRESHOLD
:
torch
.
SymInt
,
has_zp
:
bool
,
n32k16_reorder
:
bool
)
->
torch
.
Tensor
:
m
=
a
.
size
(
0
)
return
torch
.
empty
((
m
,
n
),
device
=
a
.
device
,
dtype
=
a
.
dtype
)
if
hasattr
(
torch
.
ops
.
_C
,
"ggml_dequantize"
):
...
...
@@ -1664,67 +1664,66 @@ def scaled_fp4_experts_quant(
return
output
,
output_scales
# fp8
# def scaled_fp8_quant(
# input: torch.Tensor,
# scale: Optional[torch.Tensor] = None,
# num_token_padding: Optional[int] = None,
# scale_ub: Optional[torch.Tensor] = None,
# use_per_token_if_dynamic: bool = False,
# output: Optional[torch.Tensor] = None,
# ) -> tuple[torch.Tensor, torch.Tensor]:
# """
# Quantize input tensor to FP8 and return quantized tensor and scale.
# This function supports both static and dynamic quantization: If you
# provide the scale, it will use static scaling and if you omit it,
# the scale will be determined dynamically. The function also allows
# optional padding of the output tensors for downstream kernels that
# will benefit from padding.
# Args:
# input: The input tensor to be quantized to FP8
# scale: Optional scaling factor for the FP8 quantization
# scale_ub: Optional upper bound for scaling factor in dynamic
# per token case
# num_token_padding: If specified, pad the first dimension
# of the output to at least this value.
# use_per_token_if_dynamic: Whether to do per_tensor or per_token
# in the dynamic quantization case.
# Returns:
# tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and
# scaling factor.
# """
# # This code assumes batch_dim and num_tokens are flattened
# assert (input.ndim == 2)
# shape: Union[tuple[int, int], torch.Size] = input.shape
# # For ROCm on MI300, the output fp8 dtype is torch.float8_e4m3fnuz
# out_dtype: torch.dtype = current_platform.fp8_dtype()
# if num_token_padding:
# shape = (max(num_token_padding, input.shape[0]), shape[1])
# if output is None:
# output = torch.empty(shape, device=input.device, dtype=out_dtype)
# else:
# assert num_token_padding is None, \
# "padding not supported if output passed in"
# assert output.dtype == out_dtype
# if scale is None:
# if use_per_token_if_dynamic:
# scale = torch.empty((shape[0], 1),
# device=input.device,
# dtype=torch.float32)
# torch.ops._C.dynamic_per_token_scaled_fp8_quant(
# output, input, scale, scale_ub)
# else:
# scale = torch.empty(1, device=input.device, dtype=torch.float32)
# torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
# else:
# assert scale.numel() == 1, f"{scale.shape}"
# torch.ops._C.static_scaled_fp8_quant(output, input, scale)
# return output, scale
def scaled_fp8_quant(
    input: torch.Tensor,
    scale: Optional[torch.Tensor] = None,
    num_token_padding: Optional[int] = None,
    scale_ub: Optional[torch.Tensor] = None,
    use_per_token_if_dynamic: bool = False,
    output: Optional[torch.Tensor] = None,
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Quantize a 2-D input tensor to FP8, returning the result and its scale.

    Static quantization is used when ``scale`` is supplied; when it is
    omitted the scale is computed dynamically by the kernel, either
    per-tensor or per-token. The output may optionally be padded along its
    first dimension for downstream kernels that benefit from padding.

    Args:
        input: The input tensor to be quantized to FP8. Must be 2-D.
        scale: Optional scaling factor for the FP8 quantization. When
            given it must contain exactly one element.
        num_token_padding: If specified, pad the first dimension
            of the output to at least this value.
        scale_ub: Optional upper bound for scaling factor in dynamic
            per token case.
        use_per_token_if_dynamic: Whether to do per_tensor or per_token
            in the dynamic quantization case.
        output: Optional preallocated output tensor. Its dtype must equal
            ``current_platform.fp8_dtype()`` and it cannot be combined with
            ``num_token_padding``.

    Returns:
        tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and
            scaling factor.
    """
    # This code assumes batch_dim and num_tokens are flattened
    assert (input.ndim == 2)
    out_shape: Union[tuple[int, int], torch.Size] = input.shape
    # For ROCm on MI300, the output fp8 dtype is torch.float8_e4m3fnuz
    fp8_dtype: torch.dtype = current_platform.fp8_dtype()
    if num_token_padding:
        out_shape = (max(num_token_padding, input.shape[0]), out_shape[1])

    if output is None:
        output = torch.empty(out_shape, device=input.device, dtype=fp8_dtype)
    else:
        assert num_token_padding is None, \
            "padding not supported if output passed in"
        assert output.dtype == fp8_dtype

    if scale is not None:
        # Static quantization with a caller-provided single-element scale.
        assert scale.numel() == 1, f"{scale.shape}"
        torch.ops._C.static_scaled_fp8_quant(output, input, scale)
    elif use_per_token_if_dynamic:
        # Dynamic per-token: one scale slot per (possibly padded) row.
        scale = torch.empty((out_shape[0], 1),
                            device=input.device,
                            dtype=torch.float32)
        torch.ops._C.dynamic_per_token_scaled_fp8_quant(
            output, input, scale, scale_ub)
    else:
        # Dynamic per-tensor: a single scale written by the kernel.
        scale = torch.empty(1, device=input.device, dtype=torch.float32)
        torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)

    return output, scale
# gptq allspark
...
...
vllm/compilation/activation_quant_fusion.py
View file @
98a011e9
...
...
@@ -26,14 +26,14 @@ FP4_DTYPE = torch.uint8
SILU_MUL_OP
=
torch
.
ops
.
_C
.
silu_and_mul
.
default
#
FUSED_OPS: dict[QuantKey, OpOverload] = {
#
kFp8StaticTensorSym: torch.ops._C.silu_and_mul_quant.default, # noqa: E501
#
}
#
silu_and_mul_nvfp4_quant_supported = (current_platform.is_cuda() and hasattr(
#
torch.ops._C, "silu_and_mul_nvfp4_quant"))
#
if silu_and_mul_nvfp4_quant_supported:
#
FUSED_OPS[
#
kNvfp4Quant] = torch.ops._C.silu_and_mul_nvfp4_quant.default # noqa: E501
FUSED_OPS
:
dict
[
QuantKey
,
OpOverload
]
=
{
kFp8StaticTensorSym
:
torch
.
ops
.
_C
.
silu_and_mul_quant
.
default
,
# noqa: E501
}
silu_and_mul_nvfp4_quant_supported
=
(
current_platform
.
is_cuda
()
and
hasattr
(
torch
.
ops
.
_C
,
"silu_and_mul_nvfp4_quant"
))
if
silu_and_mul_nvfp4_quant_supported
:
FUSED_OPS
[
kNvfp4Quant
]
=
torch
.
ops
.
_C
.
silu_and_mul_nvfp4_quant
.
default
# noqa: E501
class
ActivationQuantPattern
(
ABC
):
...
...
vllm/compilation/fix_functionalization.py
View file @
98a011e9
...
...
@@ -68,15 +68,15 @@ class FixFunctionalizationPass(VllmInductorPass):
elif
at_target
==
torch
.
ops
.
_C
.
fused_add_rms_norm
.
default
:
mutated_args
=
{
1
:
'input'
,
2
:
'residual'
}
self
.
defunctionalize
(
graph
,
node
,
mutated_args
)
#
elif at_target == torch.ops._C.fused_add_rms_norm_static_fp8_quant.default: # noqa: E501
#
mutated_args = {1: 'result', 2: 'residual'}
#
self.defunctionalize(graph, node, mutated_args)
elif
at_target
==
torch
.
ops
.
_C
.
fused_add_rms_norm_static_fp8_quant
.
default
:
# noqa: E501
mutated_args
=
{
1
:
'result'
,
2
:
'residual'
}
self
.
defunctionalize
(
graph
,
node
,
mutated_args
)
elif
at_target
==
torch
.
ops
.
_C
.
rms_norm_dynamic_per_token_quant
.
default
:
# noqa: E501
mutated_args
=
{
1
:
'result'
,
2
:
'scale'
,
3
:
'residual'
}
self
.
defunctionalize
(
graph
,
node
,
mutated_args
)
elif
at_target
in
[
torch
.
ops
.
_C
.
rms_norm
.
default
,
#
torch.ops._C.rms_norm_static_fp8_quant.default,
torch
.
ops
.
_C
.
rms_norm_static_fp8_quant
.
default
,
]:
mutated_args
=
{
1
:
'result'
}
self
.
defunctionalize
(
graph
,
node
,
mutated_args
)
...
...
@@ -89,12 +89,12 @@ class FixFunctionalizationPass(VllmInductorPass):
node
,
mutated_args
,
args
=
(
'result'
,
'input'
))
#
elif at_target == torch.ops._C.silu_and_mul_quant.default:
#
mutated_args = {1: 'result'}
#
self.defunctionalize(graph,
#
node,
#
mutated_args,
#
args=('result', 'input', 'scale'))
elif
at_target
==
torch
.
ops
.
_C
.
silu_and_mul_quant
.
default
:
mutated_args
=
{
1
:
'result'
}
self
.
defunctionalize
(
graph
,
node
,
mutated_args
,
args
=
(
'result'
,
'input'
,
'scale'
))
# elif hasattr(
# torch.ops._C, "silu_and_mul_nvfp4_quant"
# ) and at_target == torch.ops._C.silu_and_mul_nvfp4_quant.default:
...
...
vllm/compilation/fusion.py
View file @
98a011e9
...
...
@@ -40,12 +40,12 @@ RMS_OP = torch.ops._C.rms_norm.default
RMS_ADD_OP
=
torch
.
ops
.
_C
.
fused_add_rms_norm
.
default
QUANT_OPS
:
dict
[
QuantKey
,
OpOverload
]
=
{
#
kFp8StaticTensorSym:
#
torch.ops._C.static_scaled_fp8_quant.default, # noqa: E501
#
kFp8DynamicTensorSym:
#
torch.ops._C.dynamic_scaled_fp8_quant.default, # noqa: E501
#
kFp8DynamicTokenSym:
#
torch.ops._C.dynamic_per_token_scaled_fp8_quant.default, # noqa: E501
kFp8StaticTensorSym
:
torch
.
ops
.
_C
.
static_scaled_fp8_quant
.
default
,
# noqa: E501
kFp8DynamicTensorSym
:
torch
.
ops
.
_C
.
dynamic_scaled_fp8_quant
.
default
,
# noqa: E501
kFp8DynamicTokenSym
:
torch
.
ops
.
_C
.
dynamic_per_token_scaled_fp8_quant
.
default
,
# noqa: E501
}
if
current_platform
.
is_cuda
()
and
hasattr
(
torch
.
ops
.
_C
,
"scaled_fp4_quant"
):
QUANT_OPS
[
kNvfp4Quant
]
=
torch
.
ops
.
_C
.
scaled_fp4_quant
.
default
...
...
@@ -66,14 +66,14 @@ class FusedRMSQuantKey(NamedTuple):
FUSED_OPS
:
dict
[
FusedRMSQuantKey
,
OpOverload
]
=
{
#
FusedRMSQuantKey(kFp8StaticTensorSym, False):
#
torch.ops._C.rms_norm_static_fp8_quant.default, # noqa: E501
#
FusedRMSQuantKey(kFp8StaticTensorSym, True):
#
torch.ops._C.fused_add_rms_norm_static_fp8_quant.default, # noqa: E501
#
FusedRMSQuantKey(kFp8DynamicTokenSym, False):
#
torch.ops._C.rms_norm_dynamic_per_token_quant.default, # noqa: E501
#
FusedRMSQuantKey(kFp8DynamicTokenSym, True):
#
torch.ops._C.rms_norm_dynamic_per_token_quant.default, # noqa: E501
FusedRMSQuantKey
(
kFp8StaticTensorSym
,
False
):
torch
.
ops
.
_C
.
rms_norm_static_fp8_quant
.
default
,
# noqa: E501
FusedRMSQuantKey
(
kFp8StaticTensorSym
,
True
):
torch
.
ops
.
_C
.
fused_add_rms_norm_static_fp8_quant
.
default
,
# noqa: E501
FusedRMSQuantKey
(
kFp8DynamicTokenSym
,
False
):
torch
.
ops
.
_C
.
rms_norm_dynamic_per_token_quant
.
default
,
# noqa: E501
FusedRMSQuantKey
(
kFp8DynamicTokenSym
,
True
):
torch
.
ops
.
_C
.
rms_norm_dynamic_per_token_quant
.
default
,
# noqa: E501
}
...
...
@@ -351,22 +351,22 @@ class RMSNormQuantFusionPass(VllmPatternMatcherPass):
self
.
patterns
:
PatternMatcherPass
=
PatternMatcherPass
(
pass_name
=
"rmsnorm_quant_fusion_pass"
)
#
for epsilon in [1e-5, 1e-6]:
for
epsilon
in
[
1e-5
,
1e-6
]:
# Fuse rms_norm + static fp8 quant
#
RMSNormStaticQuantPattern(epsilon,
#
FP8_DTYPE).register(self.patterns)
RMSNormStaticQuantPattern
(
epsilon
,
FP8_DTYPE
).
register
(
self
.
patterns
)
# Fuse fused_add_rms_norm + static fp8 quant
#
FusedAddRMSNormStaticQuantPattern(epsilon, FP8_DTYPE).register(
#
self.patterns)
FusedAddRMSNormStaticQuantPattern
(
epsilon
,
FP8_DTYPE
).
register
(
self
.
patterns
)
# Fuse rms_norm + dynamic per-token fp8 quant
#
RMSNormDynamicQuantPattern(epsilon,
#
FP8_DTYPE).register(self.patterns)
RMSNormDynamicQuantPattern
(
epsilon
,
FP8_DTYPE
).
register
(
self
.
patterns
)
# Fuse fused_add_rms_norm + dynamic per-token fp8 quant
#
FusedAddRMSNormDynamicQuantPattern(epsilon, FP8_DTYPE).register(
#
self.patterns)
FusedAddRMSNormDynamicQuantPattern
(
epsilon
,
FP8_DTYPE
).
register
(
self
.
patterns
)
self
.
dump_patterns
(
config
,
self
.
patterns
)
...
...
vllm/compilation/sequence_parallelism.py
View file @
98a011e9
...
...
@@ -446,16 +446,16 @@ class SequenceParallelismPass(VllmPatternMatcherPass):
for
epsilon
in
[
1e-5
,
1e-6
]:
# RMSNorm + Static FP8 quantization patterns
#
fp8_quant_op = torch.ops._C.static_scaled_fp8_quant.default
#
FirstAllReduceRMSNormStaticFP8Pattern(
#
epsilon, self.model_dtype, self.device,
#
fp8_quant_op).register(self.patterns)
#
MiddleAllReduceRMSNormStaticFP8Pattern(
#
epsilon, self.model_dtype, self.device,
#
fp8_quant_op).register(self.patterns)
#
LastAllReduceRMSNormStaticFP8Pattern(
#
epsilon, self.model_dtype, self.device,
#
fp8_quant_op).register(self.patterns)
fp8_quant_op
=
torch
.
ops
.
_C
.
static_scaled_fp8_quant
.
default
FirstAllReduceRMSNormStaticFP8Pattern
(
epsilon
,
self
.
model_dtype
,
self
.
device
,
fp8_quant_op
).
register
(
self
.
patterns
)
MiddleAllReduceRMSNormStaticFP8Pattern
(
epsilon
,
self
.
model_dtype
,
self
.
device
,
fp8_quant_op
).
register
(
self
.
patterns
)
LastAllReduceRMSNormStaticFP8Pattern
(
epsilon
,
self
.
model_dtype
,
self
.
device
,
fp8_quant_op
).
register
(
self
.
patterns
)
# Normal RMSNorm patterns
FirstAllReduceRMSNormPattern
(
epsilon
,
self
.
model_dtype
,
...
...
vllm/envs.py
View file @
98a011e9
...
...
@@ -214,7 +214,6 @@ if TYPE_CHECKING:
VLLM_USE_OPT_OP
:
bool
=
False
VLLM_USE_TC_PAGED_ATTN
:
bool
=
False
VLLM_USE_PA_PRINT_PARAM
:
bool
=
False
VLLM_TREE_DECODING
:
bool
=
False
VLLM_SPEC_DECODE_EAGER
:
bool
=
False
VLLM_PCIE_USE_CUSTOM_ALLREDUCE
:
bool
=
False
VLLM_CUSTOM_ALLREDUCE_SUPPORTED_WORLDSIZE_MAX
:
int
=
16
...
...
@@ -1545,12 +1544,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_PA_PRINT_PARAM"
,
"False"
).
lower
()
in
(
"true"
,
"1"
)),
# If set, vLLM will use tree-style speculative decoding.
"VLLM_TREE_DECODING"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_TREE_DECODING"
,
"0"
).
strip
().
lower
()
in
(
"1"
,
"true"
)),
# If set, vLLM will disable the draft model in cudagraph mode.
"VLLM_SPEC_DECODE_EAGER"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_SPEC_DECODE_EAGER"
,
"0"
))),
...
...
vllm/v1/engine/llm_engine.py
View file @
98a011e9
...
...
@@ -140,8 +140,6 @@ class LLMEngine:
# Don't keep the dummy data in memory
self
.
reset_mm_cache
()
# self.tree_decoding = os.environ.get('VLLM_TREE_DECODING') == '1'
@
classmethod
def
from_vllm_config
(
...
...
vllm/worker/worker_base.py
View file @
98a011e9
...
...
@@ -52,8 +52,6 @@ class WorkerBase:
different hardware. Also abstracts control plane communication, e.g., to
communicate request metadata to other workers.
"""
# TODO
tree_decoding
=
(
os
.
environ
.
get
(
'VLLM_TREE_DECODING'
)
==
'1'
)
def
__init__
(
self
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment