Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a5a9263e
Commit
a5a9263e
authored
Jan 16, 2026
by
zhuwenwen
Browse files
Merge branch 'v0.9.2-dev' of
http://10.16.6.30/dcutoolkit/deeplearing/vllm
into v0.9.2-dev
parents
cabf690f
c99d07cc
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
21 additions
and
9 deletions
+21
-9
vllm/envs.py
vllm/envs.py
+9
-0
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
...ompressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+3
-2
vllm/model_executor/layers/quantization/slimquant_w4a8.py
vllm/model_executor/layers/quantization/slimquant_w4a8.py
+2
-2
vllm/model_executor/models/falcon.py
vllm/model_executor/models/falcon.py
+2
-1
vllm/model_executor/models/glm4.py
vllm/model_executor/models/glm4.py
+2
-1
vllm/model_executor/models/qwen3_moe.py
vllm/model_executor/models/qwen3_moe.py
+1
-1
vllm/model_executor/models/telechat2.py
vllm/model_executor/models/telechat2.py
+2
-2
No files found.
vllm/envs.py
View file @
a5a9263e
...
@@ -207,6 +207,7 @@ if TYPE_CHECKING:
...
@@ -207,6 +207,7 @@ if TYPE_CHECKING:
VLLM_USE_FUSED_QA_KVA_GEMM
:
bool
=
False
VLLM_USE_FUSED_QA_KVA_GEMM
:
bool
=
False
VLLM_V1_FAST_TOKEN_ID_COPY
:
bool
=
False
VLLM_V1_FAST_TOKEN_ID_COPY
:
bool
=
False
VLLM_DISABLE_SHARED_EXPERTS_STREAM
:
bool
=
True
VLLM_DISABLE_SHARED_EXPERTS_STREAM
:
bool
=
True
VLLM_W8A8_BACKEND
:
int
=
3
def
get_default_cache_root
():
def
get_default_cache_root
():
return
os
.
getenv
(
return
os
.
getenv
(
...
@@ -1335,6 +1336,13 @@ environment_variables: dict[str, Callable[[], Any]] = {
...
@@ -1335,6 +1336,13 @@ environment_variables: dict[str, Callable[[], Any]] = {
int
(
os
.
getenv
(
"VLLM_DISABLE_SHARED_EXPERTS_STREAM"
,
"1"
))
int
(
os
.
getenv
(
"VLLM_DISABLE_SHARED_EXPERTS_STREAM"
,
"1"
))
),
),
# W8A8 GEMM backend selection for vLLM quantized models.
# lightop/triton: 1
# cutlass: 2 (will remove in the future)
# blaslt: 3 (default)
# rocblas: others
"VLLM_W8A8_BACKEND"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_W8A8_BACKEND"
,
"3"
)),
}
}
# --8<-- [end:env-vars-definition]
# --8<-- [end:env-vars-definition]
...
@@ -1399,6 +1407,7 @@ def compute_hash() -> str:
...
@@ -1399,6 +1407,7 @@ def compute_hash() -> str:
"VLLM_DP_SIZE"
,
"VLLM_DP_SIZE"
,
"VLLM_USE_STANDALONE_COMPILE"
,
"VLLM_USE_STANDALONE_COMPILE"
,
"VLLM_FUSED_MOE_CHUNK_SIZE"
,
"VLLM_FUSED_MOE_CHUNK_SIZE"
,
"VLLM_W8A8_BACKEND"
,
]
]
for
key
in
environment_variables_to_hash
:
for
key
in
environment_variables_to_hash
:
if
key
in
environment_variables
:
if
key
in
environment_variables
:
...
...
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
View file @
a5a9263e
...
@@ -20,6 +20,7 @@ from vllm.model_executor.parameter import (BasevLLMParameter,
...
@@ -20,6 +20,7 @@ from vllm.model_executor.parameter import (BasevLLMParameter,
PerTensorScaleParameter
)
PerTensorScaleParameter
)
from
vllm.utils
import
W8a8GetCacheJSON
from
vllm.utils
import
W8a8GetCacheJSON
from
vllm
import
_custom_ops
as
ops
from
vllm
import
_custom_ops
as
ops
import
vllm.envs
as
envs
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -31,8 +32,8 @@ class CompressedTensorsW8A8Int8(CompressedTensorsScheme):
...
@@ -31,8 +32,8 @@ class CompressedTensorsW8A8Int8(CompressedTensorsScheme):
input_symmetric
:
bool
):
input_symmetric
:
bool
):
self
.
strategy
=
strategy
self
.
strategy
=
strategy
self
.
is_static_input_scheme
=
is_static_input_scheme
self
.
is_static_input_scheme
=
is_static_input_scheme
self
.
tritonsingleton
=
W8a8GetCacheJSON
()
self
.
tritonsingleton
=
W8a8GetCacheJSON
()
self
.
w8a8_strategy
=
int
(
os
.
getenv
(
'W8A8_SUPPORT_METHODS'
,
'1'
))
self
.
w8a8_strategy
=
envs
.
VLLM_W8A8_BACKEND
self
.
input_symmetric
=
input_symmetric
self
.
input_symmetric
=
input_symmetric
@
classmethod
@
classmethod
...
...
vllm/model_executor/layers/quantization/slimquant_w4a8.py
View file @
a5a9263e
...
@@ -92,8 +92,8 @@ class SlimQuantW4A8Int8LinearMethod(LinearMethodBase):
...
@@ -92,8 +92,8 @@ class SlimQuantW4A8Int8LinearMethod(LinearMethodBase):
def
__init__
(
self
,
quantization_config
:
SlimQuantW4A8Int8Config
):
def
__init__
(
self
,
quantization_config
:
SlimQuantW4A8Int8Config
):
self
.
quantization_config
=
quantization_config
self
.
quantization_config
=
quantization_config
self
.
tritonsingleton
=
W8a8GetCacheJSON
()
self
.
tritonsingleton
=
W8a8GetCacheJSON
()
self
.
w8a8_strategy
=
int
(
os
.
getenv
(
'W8A8_SUPPORT_METHODS'
,
'1'
))
self
.
w8a8_strategy
=
envs
.
VLLM_W8A8_BACKEND
def
process_weights_after_loading
(
self
,
layer
:
torch
.
nn
.
Module
)
->
None
:
def
process_weights_after_loading
(
self
,
layer
:
torch
.
nn
.
Module
)
->
None
:
n
=
layer
.
weight
.
shape
[
0
]
n
=
layer
.
weight
.
shape
[
0
]
...
...
vllm/model_executor/models/falcon.py
View file @
a5a9263e
...
@@ -58,6 +58,7 @@ from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
...
@@ -58,6 +58,7 @@ from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
from
vllm
import
_custom_ops
as
ops
from
vllm
import
_custom_ops
as
ops
from
vllm.model_executor.utils
import
pad_weight
,
gemm_bank_conf
from
vllm.model_executor.utils
import
pad_weight
,
gemm_bank_conf
import
vllm.envs
as
envs
FalconConfig
=
Union
[
HF_FalconConfig
,
RWConfig
]
FalconConfig
=
Union
[
HF_FalconConfig
,
RWConfig
]
...
@@ -393,7 +394,7 @@ class FalconModel(nn.Module):
...
@@ -393,7 +394,7 @@ class FalconModel(nn.Module):
self
.
use_gemm_pad
=
os
.
environ
.
get
(
'GEMM_PAD'
)
==
'1'
self
.
use_gemm_pad
=
os
.
environ
.
get
(
'GEMM_PAD'
)
==
'1'
self
.
use_fa_pad
=
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
self
.
use_fa_pad
=
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
self
.
use_awq_pad
=
os
.
environ
.
get
(
'AWQ_PAD'
)
==
'1'
self
.
use_awq_pad
=
os
.
environ
.
get
(
'AWQ_PAD'
)
==
'1'
self
.
w8a8_strategy
=
int
(
os
.
getenv
(
'W8A8_SUPPORT_METHODS'
,
'1'
))
self
.
w8a8_strategy
=
envs
.
VLLM_W8A8_BACKEND
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
)
->
torch
.
Tensor
:
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
)
->
torch
.
Tensor
:
return
self
.
word_embeddings
(
input_ids
)
return
self
.
word_embeddings
(
input_ids
)
...
...
vllm/model_executor/models/glm4.py
View file @
a5a9263e
...
@@ -31,6 +31,7 @@ from typing import Optional, Union
...
@@ -31,6 +31,7 @@ from typing import Optional, Union
import
torch
import
torch
from
torch
import
nn
from
torch
import
nn
from
transformers
import
Glm4Config
from
transformers
import
Glm4Config
import
vllm.envs
as
envs
class
MultiModalConfigProxy
:
class
MultiModalConfigProxy
:
...
@@ -332,7 +333,7 @@ class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
...
@@ -332,7 +333,7 @@ class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
self
.
use_gemm_pad
=
os
.
environ
.
get
(
'GEMM_PAD'
)
==
'1'
self
.
use_gemm_pad
=
os
.
environ
.
get
(
'GEMM_PAD'
)
==
'1'
self
.
use_fa_pad
=
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
self
.
use_fa_pad
=
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
self
.
use_awq_pad
=
os
.
environ
.
get
(
'AWQ_PAD'
)
==
'1'
self
.
use_awq_pad
=
os
.
environ
.
get
(
'AWQ_PAD'
)
==
'1'
self
.
w8a8_strategy
=
int
(
os
.
getenv
(
'W8A8_SUPPORT_METHODS'
,
'1'
))
self
.
w8a8_strategy
=
envs
.
VLLM_W8A8_BACKEND
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
)
->
torch
.
Tensor
:
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
)
->
torch
.
Tensor
:
return
self
.
model
.
get_input_embeddings
(
input_ids
)
return
self
.
model
.
get_input_embeddings
(
input_ids
)
...
...
vllm/model_executor/models/qwen3_moe.py
View file @
a5a9263e
...
@@ -453,7 +453,7 @@ class Qwen3MoeModel(nn.Module):
...
@@ -453,7 +453,7 @@ class Qwen3MoeModel(nn.Module):
self
.
use_gemm_pad
=
os
.
environ
.
get
(
'GEMM_PAD'
)
==
'1'
self
.
use_gemm_pad
=
os
.
environ
.
get
(
'GEMM_PAD'
)
==
'1'
self
.
use_fa_pad
=
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
self
.
use_fa_pad
=
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
self
.
use_awq_pad
=
os
.
environ
.
get
(
'AWQ_PAD'
)
==
'1'
self
.
use_awq_pad
=
os
.
environ
.
get
(
'AWQ_PAD'
)
==
'1'
self
.
w8a8_strategy
=
int
(
os
.
getenv
(
'W8A8_SUPPORT_METHODS'
,
'1'
))
self
.
w8a8_strategy
=
envs
.
VLLM_W8A8_BACKEND
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
)
->
torch
.
Tensor
:
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
)
->
torch
.
Tensor
:
return
self
.
embed_tokens
(
input_ids
)
return
self
.
embed_tokens
(
input_ids
)
...
...
vllm/model_executor/models/telechat2.py
View file @
a5a9263e
...
@@ -37,6 +37,7 @@ from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper,
...
@@ -37,6 +37,7 @@ from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper,
from
vllm.utils
import
W8a8GetCacheJSON
from
vllm.utils
import
W8a8GetCacheJSON
from
vllm
import
_custom_ops
as
ops
from
vllm
import
_custom_ops
as
ops
from
vllm.model_executor.utils
import
pad_weight
,
gemm_bank_conf
from
vllm.model_executor.utils
import
pad_weight
,
gemm_bank_conf
import
vllm.envs
as
envs
class
TeleChat2Model
(
LlamaModel
):
class
TeleChat2Model
(
LlamaModel
):
...
@@ -66,8 +67,7 @@ class TeleChat2Model(LlamaModel):
...
@@ -66,8 +67,7 @@ class TeleChat2Model(LlamaModel):
self
.
use_gemm_pad
=
os
.
environ
.
get
(
'GEMM_PAD'
)
==
'1'
self
.
use_gemm_pad
=
os
.
environ
.
get
(
'GEMM_PAD'
)
==
'1'
self
.
use_fa_pad
=
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
self
.
use_fa_pad
=
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
self
.
use_awq_pad
=
os
.
environ
.
get
(
'AWQ_PAD'
)
==
'1'
self
.
use_awq_pad
=
os
.
environ
.
get
(
'AWQ_PAD'
)
==
'1'
self
.
w8a8_strategy
=
int
(
os
.
getenv
(
'W8A8_SUPPORT_METHODS'
,
'1'
))
self
.
w8a8_strategy
=
envs
.
VLLM_W8A8_BACKEND
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
torch
.
Tensor
]])
->
set
[
str
]:
stacked_params_mapping
=
[
stacked_params_mapping
=
[
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment