Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
53076d70
Commit
53076d70
authored
Mar 24, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.2' into v0.8.2-ori
parents
322a0be6
9c5c81b0
Changes
219
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
621 additions
and
246 deletions
+621
-246
vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
...name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
+146
-0
vllm/model_executor/layers/layernorm.py
vllm/model_executor/layers/layernorm.py
+77
-17
vllm/model_executor/layers/mamba/mamba_mixer2.py
vllm/model_executor/layers/mamba/mamba_mixer2.py
+28
-22
vllm/model_executor/layers/quantization/kv_cache.py
vllm/model_executor/layers/quantization/kv_cache.py
+13
-3
vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
...name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
+26
-0
vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
...name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
+26
-0
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
...name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
+26
-0
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
...name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
+26
-0
vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
...name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
+26
-0
vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
...name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
+26
-0
vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
...name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
+26
-0
vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
...name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
+26
-0
vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
...name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
+18
-0
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
...name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
+26
-0
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
...name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
+26
-0
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
...name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
+26
-0
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
...name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
+26
-0
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
...name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
+26
-0
vllm/model_executor/model_loader/openvino.py
vllm/model_executor/model_loader/openvino.py
+0
-204
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_v2.py
+1
-0
No files found.
vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
0 → 100644
View file @
53076d70
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
3
},
"2"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
8
,
"num_stages"
:
2
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
32
,
"num_warps"
:
8
,
"num_stages"
:
2
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
8
,
"num_stages"
:
2
},
"24"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
8
,
"num_stages"
:
2
},
"32"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
8
,
"num_stages"
:
2
},
"48"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
8
,
"num_stages"
:
2
},
"64"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
8
,
"num_stages"
:
2
},
"96"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
8
,
"num_stages"
:
2
},
"128"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
8
,
"num_stages"
:
2
},
"256"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
32
,
"num_warps"
:
8
,
"num_stages"
:
2
},
"512"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
32
,
"num_warps"
:
8
,
"num_stages"
:
2
},
"1024"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"1536"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
},
"2048"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"3072"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"4096"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
4
,
"num_stages"
:
2
}
}
vllm/model_executor/layers/layernorm.py
View file @
53076d70
...
...
@@ -5,7 +5,77 @@ from typing import Optional, Tuple, Union
import
torch
import
torch.nn
as
nn
import
vllm.envs
as
envs
from
vllm.model_executor.custom_op
import
CustomOp
from
vllm.platforms
import
current_platform
def is_rocm_aiter_rmsnorm_enabled() -> bool:
    """Tell whether the AITER RMSNorm path should be used.

    The result is the short-circuit conjunction of three checks, evaluated
    in this order: the current platform reports ROCm, the
    ``VLLM_ROCM_USE_AITER_RMSNORM`` env setting is truthy, and the
    ``VLLM_ROCM_USE_AITER`` env setting is truthy.
    """
    return (current_platform.is_rocm()
            and envs.VLLM_ROCM_USE_AITER_RMSNORM
            and envs.VLLM_ROCM_USE_AITER)
def rms_norm(x: torch.Tensor, weight: torch.Tensor,
             variance_epsilon: float) -> torch.Tensor:
    """Run the ``rms_norm`` custom op on ``x`` and return a new tensor.

    A fresh tensor with the same shape/dtype as ``x`` is allocated and
    handed to the custom op as its first argument; that tensor is what
    gets returned. ``x`` itself is only passed as an input here.
    """
    # Imported lazily, exactly as the original does, so the compiled
    # extension is only required when this path is actually taken.
    from vllm import _custom_ops as custom_ops

    normed = torch.empty_like(x)
    custom_ops.rms_norm(normed, x, weight, variance_epsilon)
    return normed
def fused_add_rms_norm(
    x: torch.Tensor,
    residual: torch.Tensor,
    weight: torch.Tensor,
    variance_epsilon: float,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Run the ``fused_add_rms_norm`` custom op and return ``(x, residual)``.

    No new tensors are allocated here: the very same ``x`` and ``residual``
    objects passed in are returned after the op has been invoked on them
    (presumably the op updates both in place -- confirm against the
    ``_custom_ops`` implementation).
    """
    from vllm import _custom_ops as custom_ops

    custom_ops.fused_add_rms_norm(x, residual, weight, variance_epsilon)
    return x, residual
def rocm_aiter_rms_norm(x: torch.Tensor, weight: torch.Tensor,
                        variance_epsilon: float) -> torch.Tensor:
    """Delegate RMS normalization to the ``aiter`` package.

    Returns whatever ``aiter.rms_norm(x, weight, variance_epsilon)``
    returns; ``aiter`` is imported on first use rather than at module
    import time.
    """
    import aiter as rocm_aiter

    normed = rocm_aiter.rms_norm(x, weight, variance_epsilon)
    return normed
def rocm_aiter_fused_add_rms_norm(
    x: torch.Tensor,
    residual: torch.Tensor,
    weight: torch.Tensor,
    variance_epsilon: float,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Run aiter's ``rmsnorm2d_fwd_with_add`` and return ``(x, residual)``.

    ``x`` is supplied as both the output and the input argument, and
    ``residual`` as both the residual input and the residual output, so
    the same two tensor objects that were passed in are returned.
    """
    import aiter as rocm_aiter

    # Assuming the correct signature for rmsnorm2d_fwd_with_add
    rocm_aiter.rmsnorm2d_fwd_with_add(
        x,  # output
        x,  # input
        residual,  # residual input
        residual,  # residual output
        weight,
        variance_epsilon,
    )
    return x, residual
def dispatch_cuda_rmsnorm_func(add_residual: bool):
    """Pick the RMSNorm implementation to call.

    Args:
        add_residual: When true, one of the fused add+norm functions
            (taking ``x, residual, weight, variance_epsilon``) is returned;
            otherwise one of the plain norm functions (taking
            ``x, weight, variance_epsilon``).

    Returns:
        The AITER-backed variant when ``is_rocm_aiter_rmsnorm_enabled()``
        is truthy, else the ``_custom_ops``-backed variant.
    """
    use_aiter = is_rocm_aiter_rmsnorm_enabled()
    if add_residual:
        return (rocm_aiter_fused_add_rms_norm
                if use_aiter else fused_add_rms_norm)
    return rocm_aiter_rms_norm if use_aiter else rms_norm
@
CustomOp
.
register
(
"rms_norm"
)
...
...
@@ -81,24 +151,14 @@ class RMSNorm(CustomOp):
if
self
.
variance_size_override
is
not
None
:
return
self
.
forward_native
(
x
,
residual
)
from
vllm
import
_custom_ops
as
ops
add_residual
=
residual
is
not
None
norm_func
=
dispatch_cuda_rmsnorm_func
(
add_residual
)
if
residual
is
not
None
:
ops
.
fused_add_rms_norm
(
x
,
residual
,
self
.
weight
.
data
,
self
.
variance_epsilon
,
)
return
x
,
residual
out
=
torch
.
empty_like
(
x
)
ops
.
rms_norm
(
out
,
x
,
self
.
weight
.
data
,
self
.
variance_epsilon
,
)
return
out
if
add_residual
:
return
norm_func
(
x
,
residual
,
self
.
weight
.
data
,
self
.
variance_epsilon
)
else
:
return
norm_func
(
x
,
self
.
weight
.
data
,
self
.
variance_epsilon
)
def
forward_hpu
(
self
,
...
...
vllm/model_executor/layers/mamba/mamba_mixer2.py
View file @
53076d70
...
...
@@ -251,6 +251,9 @@ class MambaMixer2(CustomOp):
"then num_groups must equal 1."
)
assert
self
.
tp_size
==
1
or
quant_config
is
None
,
\
"Tensor parallel currently not supported for quantized models."
self
.
ssm_state_size
=
ssm_state_size
self
.
activation
=
activation
...
...
@@ -331,22 +334,24 @@ class MambaMixer2(CustomOp):
],
self
.
tp_size
,
tp_rank
)
})
delattr
(
self
.
in_proj
.
weight
,
"weight_loader"
)
set_weight_attrs
(
self
.
in_proj
.
weight
,
{
"weight_loader"
:
mamba_v2_sharded_weight_loader
(
[
intermediate_settings
,
# for gate
intermediate_settings
,
group_shard_settings
,
group_shard_settings
,
head_setings
,
# for dt
],
self
.
tp_size
,
tp_rank
)
})
if
quant_config
is
None
:
# - quant layers do not have a weight loader
delattr
(
self
.
in_proj
.
weight
,
"weight_loader"
)
set_weight_attrs
(
self
.
in_proj
.
weight
,
{
"weight_loader"
:
mamba_v2_sharded_weight_loader
(
[
intermediate_settings
,
# for gate
intermediate_settings
,
group_shard_settings
,
group_shard_settings
,
head_setings
,
# for dt
],
self
.
tp_size
,
tp_rank
)
})
# - these are TPed by heads to reduce the size of the
# temporal shape
...
...
@@ -465,10 +470,11 @@ class MambaMixer2(CustomOp):
if
has_prefill
:
initial_states
=
None
if
has_initial_states
is
not
None
and
any
(
has_initial_states
):
for
idx
in
mamba_cache_params
.
state_indices_tensor
[
~
has_initial_states
]:
mamba_cache_params
.
ssm_state
[
idx
].
zero_
()
if
has_initial_states
is
not
None
and
torch
.
any
(
has_initial_states
):
zero_init_indices
=
mamba_cache_params
.
state_indices_tensor
[
~
has_initial_states
]
mamba_cache_params
.
ssm_state
[
zero_init_indices
]
=
0
initial_states
=
mamba_cache_params
.
ssm_state
[
mamba_cache_params
.
state_indices_tensor
]
...
...
@@ -494,8 +500,8 @@ class MambaMixer2(CustomOp):
# update ssm states
# - varlen state is a (batch, nheads, headdim, dstate) tensor
for
i
,
idx
in
enumerate
(
mamba_cache_params
.
state_indices_tensor
):
mamba_cache_params
.
s
sm_state
[
idx
].
copy_
(
varlen_state
[
i
])
mamba_cache_params
.
ssm_state
[
mamba_cache_params
.
s
tate_indices_tensor
]
=
varlen_state
# - reshape
hidden_states
=
scan_output
.
view
(
seq_len
,
-
1
)
...
...
vllm/model_executor/layers/quantization/kv_cache.py
View file @
53076d70
...
...
@@ -26,11 +26,14 @@ class BaseKVCacheMethod(QuantizeMethodBase):
def
create_weights
(
self
,
layer
:
torch
.
nn
.
Module
):
"""
Create "weight" (aka k_scale and v_scale) for an attention layer.
Create "weight" (aka q_scale, k_scale and v_scale)
for an attention layer.
"""
# Initialize the KV cache scales to -1.0,
which is
an invalid value.
# If the k/v_scale appear
s
in the checkpoint, it will be
# Initialize the
Q and
KV cache scales to -1.0, an invalid value.
# If the
q and
k/v_scale
s
appear in the checkpoint, it will be
# overwritten when loading weights.
layer
.
q_scale
=
torch
.
nn
.
Parameter
(
torch
.
tensor
(
-
1.0
),
requires_grad
=
False
)
layer
.
k_scale
=
torch
.
nn
.
Parameter
(
torch
.
tensor
(
-
1.0
),
requires_grad
=
False
)
layer
.
v_scale
=
torch
.
nn
.
Parameter
(
torch
.
tensor
(
-
1.0
),
...
...
@@ -75,6 +78,13 @@ class BaseKVCacheMethod(QuantizeMethodBase):
raise
ValueError
(
"Only support per-tensor scaling factor "
"for fp8 KV cache"
)
if
layer
.
q_scale
<
0.0
:
logger
.
warning_once
(
"Checkpoint does not provide a q scaling factor. "
"Setting it to k_scale. This only matters for "
"the flash-attn backend."
)
layer
.
_q_scale
.
copy_
(
k_scale
)
# These are used in the final Attention.forward()
layer
.
_k_scale
.
copy_
(
k_scale
)
layer
.
_v_scale
.
copy_
(
v_scale
)
...
...
vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
0 → 100644
View file @
53076d70
{
"2048"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"3072"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"4096"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
}
}
vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
0 → 100644
View file @
53076d70
{
"2048"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"3072"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
2
}
}
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
0 → 100644
View file @
53076d70
{
"2048"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"3072"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
}
}
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
0 → 100644
View file @
53076d70
{
"2048"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"3072"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
}
}
vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
0 → 100644
View file @
53076d70
{
"2048"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"3072"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
}
}
vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
0 → 100644
View file @
53076d70
{
"2048"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
32
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"3072"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
2
}
}
vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
0 → 100644
View file @
53076d70
{
"2048"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"3072"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
}
}
vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
0 → 100644
View file @
53076d70
{
"2048"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
32
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"3072"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"4096"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
}
}
vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
0 → 100644
View file @
53076d70
{
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
32
,
"num_warps"
:
4
,
"num_stages"
:
4
}
}
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
0 → 100644
View file @
53076d70
{
"2048"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"3072"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
2
}
}
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
0 → 100644
View file @
53076d70
{
"2048"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"3072"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
2
}
}
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
0 → 100644
View file @
53076d70
{
"2048"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"3072"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
}
}
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
0 → 100644
View file @
53076d70
{
"2048"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"3072"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
}
}
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
0 → 100644
View file @
53076d70
{
"2048"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"3072"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
32
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
3
}
}
vllm/model_executor/model_loader/openvino.py
deleted
100644 → 0
View file @
322a0be6
# SPDX-License-Identifier: Apache-2.0
# ruff: noqa: SIM117
from
pathlib
import
Path
from
typing
import
Optional
import
openvino
as
ov
import
torch
from
huggingface_hub
import
HfApi
from
openvino._offline_transformations
import
paged_attention_transformation
from
optimum.intel
import
OVModelForCausalLM
from
torch
import
nn
import
vllm.envs
as
envs
from
vllm.config
import
ModelConfig
,
VllmConfig
,
set_current_vllm_config
from
vllm.forward_context
import
get_forward_context
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.logits_processor
import
(
LogitsProcessor
,
_prune_hidden_states
)
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.platforms
import
current_platform
logger
=
init_logger
(
__name__
)
def _flatten_inputs(inputs):
    """
    Flatten arbitrarily nested lists, tuples and dicts into one flat list.

    ``None`` entries are dropped, dicts contribute their values (in
    iteration order), and anything else is kept as a leaf in encounter
    order.
    """
    flat = []
    for item in inputs:
        if item is None:
            continue
        if isinstance(item, dict):
            flat += _flatten_inputs(list(item.values()))
        elif isinstance(item, (list, tuple)):
            flat += _flatten_inputs(item)
        else:
            flat.append(item)
    return flat
def _modify_cache_parameters(model: ov.Model, kv_cache_dtype: ov.Type,
                             is_cpu: bool):
    """Apply hardware dependent modifications to the KV cache parameters.

    Every model parameter whose output tensor carries exactly one name
    starting with ``"key_cache."`` or ``"value_cache."`` gets a new partial
    shape (CPU or GPU layout, chosen by ``is_cpu``) and its element type set
    to ``kv_cache_dtype``. All other parameters are left untouched.
    Finally the model is asked to re-validate its nodes and infer types.
    """
    for parameter in model.get_parameters():
        names = parameter.get_output_tensor(0).get_names()
        if len(names) != 1:
            continue
        name = next(iter(names))
        shape = parameter.get_partial_shape()

        # use real block size if available, just a placeholder
        # to provide the expected rank
        num_blocks = ov.Dimension()
        block_size = ov.Dimension()
        head_size = ov.Dimension()

        # Key and value caches share the CPU layout and differ only in
        # where shape[2] sits in the GPU layout.
        if name.startswith("key_cache."):
            cpu_shape = [num_blocks, shape[1], block_size, head_size]
            gpu_shape = [num_blocks, shape[1], shape[2], block_size]
        elif name.startswith("value_cache."):
            cpu_shape = [num_blocks, shape[1], block_size, head_size]
            gpu_shape = [num_blocks, shape[1], block_size, shape[2]]
        else:
            continue

        if is_cpu:
            target_shape = cpu_shape
        else:
            target_shape = gpu_shape
        parameter.set_partial_shape(ov.PartialShape(target_shape))
        parameter.set_element_type(kv_cache_dtype)

    model.validate_nodes_and_infer_types()
def _require_model_export(model_id, revision=None, subfolder=None):
    """Return True when ``model_id`` lacks a ready-made OpenVINO IR.

    If ``model_id`` (joined with ``subfolder`` when given) is a local
    directory, the answer is True unless both ``openvino_model.xml`` and
    ``openvino_model.bin`` exist in it. Otherwise the file listing returned
    by ``HfApi().model_info`` for ``revision`` (``"main"`` when falsy) is
    searched for the same two files, optionally under ``subfolder``. Any
    exception raised during that lookup is treated as "export required".
    """
    local_dir = Path(model_id)
    if subfolder is not None:
        local_dir = local_dir / subfolder

    if local_dir.is_dir():
        # De Morgan of "xml missing or bin missing"; keeps the same
        # short-circuit order (xml is probed first).
        return not ((local_dir / "openvino_model.xml").exists()
                    and (local_dir / "openvino_model.bin").exists())

    hf_api = HfApi()
    try:
        info = hf_api.model_info(model_id, revision=revision or "main")

        if subfolder is None:
            prefix = None
        else:
            prefix = Path(subfolder).as_posix()

        available = [
            sibling.rfilename for sibling in info.siblings
            if prefix is None or sibling.rfilename.startswith(prefix)
        ]

        if prefix is None:
            xml_name = "openvino_model.xml"
        else:
            xml_name = f"{prefix}/openvino_model.xml"

        return not (xml_name in available
                    and xml_name.replace(".xml", ".bin") in available)
    except Exception:
        # Best effort: if the hub cannot be queried, fall back to exporting.
        return True
class OpenVINOCausalLM(nn.Module):
    """Causal LM wrapper that runs inference through an OpenVINO request.

    Construction loads (and, if no IR is found, exports) the model via
    ``OVModelForCausalLM``, applies the paged-attention transformation,
    adjusts the KV cache parameters, compiles the model for the configured
    device and keeps a single infer request in ``self.ov_request``.
    """

    def __init__(
        self,
        ov_core: ov.Core,
        model_config: ModelConfig,
        kv_cache_dtype: ov.Type,
    ) -> None:
        super().__init__()
        self.logits_processor = LogitsProcessor(
            model_config.hf_config.vocab_size, logits_as_input=True)
        self.sampler = Sampler()

        export = _require_model_export(model_config.model)
        if export:
            logger.warning(
                f"Provided model id {model_config.model} does not "  # noqa: G004
                "contain OpenVINO IR, the model will be converted to IR with "
                "default options. If you need to use specific options for "
                "model conversion, use optimum-cli export openvino with "
                "desired options.")
            # Weight quantization is only considered when we export here.
            load_in_8bit = envs.VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS
        else:
            logger.warning(
                "OpenVINO IR is available for provided model id "  # noqa: G004
                f"{model_config.model}. This IR will be used for inference "
                "as-is, all possible options that may affect model conversion "
                "are ignored.")
            load_in_8bit = False

        pt_model = OVModelForCausalLM.from_pretrained(
            model_config.model,
            export=export,
            compile=False,
            load_in_8bit=load_in_8bit,
            trust_remote_code=model_config.trust_remote_code,
        )

        ov_device = envs.VLLM_OPENVINO_DEVICE
        ov_model = pt_model.model
        paged_attention_transformation(ov_model)
        _modify_cache_parameters(ov_model, kv_cache_dtype,
                                 current_platform.is_openvino_cpu())

        compiled = ov_core.compile_model(pt_model.model, ov_device)
        self.ov_request = compiled.create_infer_request()

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: list[tuple[ov.Tensor, ov.Tensor]],
    ) -> torch.Tensor:
        """Run one inference request and return the logits as a 2-D tensor.

        The request inputs are, in order: ``input_ids``, ``positions``, the
        flattened KV caches, then five fields read from the current forward
        context's attention metadata.
        """
        flat_kv_caches = _flatten_inputs(kv_caches)
        attn_metadata = get_forward_context().attn_metadata

        inputs = [input_ids, positions]
        inputs.extend(flat_kv_caches)
        inputs.extend([
            attn_metadata.past_lens,
            attn_metadata.subsequence_begins,
            attn_metadata.block_indices,
            attn_metadata.block_indices_begins,
            attn_metadata.max_context_len,
        ])

        request = self.ov_request
        request.start_async(inputs, share_inputs=True)
        request.wait()

        logits = torch.from_numpy(self.ov_request.get_tensor("logits").data)

        # TODO: remove 'view' once OpenVINO PA will drop 'seq_len' dimension
        last_dim = logits.shape[-1]
        return logits.view(-1, last_dim)

    def compute_logits(self, hidden_states: torch.Tensor,
                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
        """Prune ``hidden_states`` and feed them to the logits processor."""
        pruned = _prune_hidden_states(hidden_states, sampling_metadata)
        return self.logits_processor(None, pruned, sampling_metadata)

    def sample(
        self,
        logits: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[SamplerOutput]:
        """Return the sampler's output for ``logits``."""
        return self.sampler(logits, sampling_metadata)
def get_model(
    vllm_config: VllmConfig,
    kv_cache_dtype: ov.Type,
    **kwargs,
) -> torch.nn.Module:
    """Build an ``OpenVINOCausalLM`` for ``vllm_config``.

    Recognised keyword arguments are ``lora_config`` and ``ov_core``; both
    default to ``None`` when absent. The model is constructed while
    ``vllm_config`` is installed as the current vLLM config.

    Raises:
        ValueError: If a truthy ``lora_config`` is supplied.
    """
    if kwargs.get("lora_config"):
        raise ValueError(
            "OpenVINO modeling does not support LoRA, "
            "but LoRA is enabled. Support for this model may "
            "be added in the future. If this is important to you, "
            "please open an issue on github.")

    ov_core = kwargs.get("ov_core")
    with set_current_vllm_config(vllm_config):
        model = OpenVINOCausalLM(ov_core, vllm_config.model_config,
                                 kv_cache_dtype)
        return model
vllm/model_executor/models/deepseek_v2.py
View file @
53076d70
...
...
@@ -589,6 +589,7 @@ class DeepseekV2Model(nn.Module):
model_config
=
vllm_config
.
model_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
self
.
config
=
config
self
.
vocab_size
=
config
.
vocab_size
...
...
Prev
1
…
4
5
6
7
8
9
10
11
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment