Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
6a14c9f3
Commit
6a14c9f3
authored
Jan 14, 2026
by
zhuwenwen
Browse files
Merge branch 'v0.11.0-dev' of
http://10.16.6.30/dcutoolkit/deeplearing/vllm
into v0.11.0-dev
parents
0e607f8e
1a64d266
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
19 additions
and
7 deletions
+19
-7
vllm/envs.py
vllm/envs.py
+9
-0
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
...ers/quantization/compressed_tensors/compressed_tensors.py
+1
-1
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
...ompressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+2
-1
vllm/model_executor/models/falcon.py
vllm/model_executor/models/falcon.py
+2
-1
vllm/model_executor/models/glm4.py
vllm/model_executor/models/glm4.py
+2
-2
vllm/model_executor/models/qwen3_moe.py
vllm/model_executor/models/qwen3_moe.py
+1
-1
vllm/model_executor/models/telechat2.py
vllm/model_executor/models/telechat2.py
+2
-1
No files found.
vllm/envs.py
View file @
6a14c9f3
...
@@ -246,6 +246,7 @@ if TYPE_CHECKING:
...
@@ -246,6 +246,7 @@ if TYPE_CHECKING:
VLLM_USE_MARLIN_W16A16_MOE
:
bool
=
False
VLLM_USE_MARLIN_W16A16_MOE
:
bool
=
False
VLLM_V1_FAST_TOKEN_ID_COPY
:
bool
=
False
VLLM_V1_FAST_TOKEN_ID_COPY
:
bool
=
False
VLLM_V1_USE_REDUCED_TOPK_TOPP_SAMPLER
:
bool
=
False
VLLM_V1_USE_REDUCED_TOPK_TOPP_SAMPLER
:
bool
=
False
VLLM_W8A8_BACKEND
:
int
=
3
def
get_default_cache_root
():
def
get_default_cache_root
():
return
os
.
getenv
(
return
os
.
getenv
(
...
@@ -1694,6 +1695,13 @@ environment_variables: dict[str, Callable[[], Any]] = {
...
@@ -1694,6 +1695,13 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_V1_USE_REDUCED_TOPK_TOPP_SAMPLER"
:
"VLLM_V1_USE_REDUCED_TOPK_TOPP_SAMPLER"
:
lambda
:
(
os
.
getenv
(
"VLLM_V1_USE_REDUCED_TOPK_TOPP_SAMPLER"
,
lambda
:
(
os
.
getenv
(
"VLLM_V1_USE_REDUCED_TOPK_TOPP_SAMPLER"
,
"0"
).
lower
()
in
(
"true"
,
"1"
)),
"0"
).
lower
()
in
(
"true"
,
"1"
)),
# W8A8 GEMM backend selection for vLLM quantized models.
# lightop/triton: 1
# cutlass: 2 (will remove in the future)
# blaslt: 3 (default)
# rocblas: others
"VLLM_W8A8_BACKEND"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_W8A8_BACKEND"
,
"3"
)),
}
}
# --8<-- [end:env-vars-definition]
# --8<-- [end:env-vars-definition]
...
@@ -1792,6 +1800,7 @@ def compute_hash() -> str:
...
@@ -1792,6 +1800,7 @@ def compute_hash() -> str:
"VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE"
,
"VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE"
,
"VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING"
,
"VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING"
,
"VLLM_USE_FBGEMM"
,
"VLLM_USE_FBGEMM"
,
"VLLM_W8A8_BACKEND"
,
]
]
for
key
in
environment_variables_to_hash
:
for
key
in
environment_variables_to_hash
:
# if this goes out of sync with environment_variables,
# if this goes out of sync with environment_variables,
...
...
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
View file @
6a14c9f3
...
@@ -720,7 +720,7 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
...
@@ -720,7 +720,7 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
def
__init__
(
self
,
quantization_config
:
CompressedTensorsConfig
):
def
__init__
(
self
,
quantization_config
:
CompressedTensorsConfig
):
self
.
quantization_config
=
quantization_config
self
.
quantization_config
=
quantization_config
self
.
tritonsingleton
=
W8a8GetCacheJSON
()
self
.
tritonsingleton
=
W8a8GetCacheJSON
()
self
.
w8a8_strategy
=
int
(
os
.
getenv
(
'W8A8_SUPPORT_METHODS'
,
'1'
))
self
.
w8a8_strategy
=
envs
.
VLLM_W8A8_BACKEND
def
process_weights_after_loading
(
self
,
layer
:
torch
.
nn
.
Module
)
->
None
:
def
process_weights_after_loading
(
self
,
layer
:
torch
.
nn
.
Module
)
->
None
:
weights_scheme
=
(
weights_scheme
=
(
...
...
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
View file @
6a14c9f3
...
@@ -7,6 +7,7 @@ import torch
...
@@ -7,6 +7,7 @@ import torch
from
compressed_tensors.quantization
import
QuantizationStrategy
from
compressed_tensors.quantization
import
QuantizationStrategy
import
os
import
os
import
vllm.envs
as
envs
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.quantization.compressed_tensors.schemes
import
(
from
vllm.model_executor.layers.quantization.compressed_tensors.schemes
import
(
CompressedTensorsScheme
)
CompressedTensorsScheme
)
...
@@ -29,7 +30,7 @@ class CompressedTensorsW8A8Int8(CompressedTensorsScheme):
...
@@ -29,7 +30,7 @@ class CompressedTensorsW8A8Int8(CompressedTensorsScheme):
input_symmetric
:
bool
):
input_symmetric
:
bool
):
self
.
strategy
=
strategy
self
.
strategy
=
strategy
self
.
is_static_input_scheme
=
is_static_input_scheme
self
.
is_static_input_scheme
=
is_static_input_scheme
self
.
w8a8_strategy
=
int
(
os
.
getenv
(
'W8A8_SUPPORT_METHODS'
,
'1'
))
self
.
w8a8_strategy
=
envs
.
VLLM_W8A8_BACKEND
self
.
input_symmetric
=
input_symmetric
self
.
input_symmetric
=
input_symmetric
@
classmethod
@
classmethod
...
...
vllm/model_executor/models/falcon.py
View file @
6a14c9f3
...
@@ -32,6 +32,7 @@ from torch import nn
...
@@ -32,6 +32,7 @@ from torch import nn
from
torch.nn
import
LayerNorm
from
torch.nn
import
LayerNorm
from
transformers
import
FalconConfig
as
HF_FalconConfig
from
transformers
import
FalconConfig
as
HF_FalconConfig
import
vllm.envs
as
envs
from
vllm.attention
import
Attention
from
vllm.attention
import
Attention
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.config
import
CacheConfig
,
VllmConfig
...
@@ -393,7 +394,7 @@ class FalconModel(nn.Module):
...
@@ -393,7 +394,7 @@ class FalconModel(nn.Module):
self
.
use_gemm_pad
=
os
.
environ
.
get
(
'GEMM_PAD'
)
==
'1'
self
.
use_gemm_pad
=
os
.
environ
.
get
(
'GEMM_PAD'
)
==
'1'
self
.
use_fa_pad
=
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
self
.
use_fa_pad
=
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
self
.
use_awq_pad
=
os
.
environ
.
get
(
'AWQ_PAD'
)
==
'1'
self
.
use_awq_pad
=
os
.
environ
.
get
(
'AWQ_PAD'
)
==
'1'
self
.
w8a8_strategy
=
int
(
os
.
getenv
(
'W8A8_SUPPORT_METHODS'
,
'1'
))
self
.
w8a8_strategy
=
envs
.
VLLM_W8A8_BACKEND
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
)
->
torch
.
Tensor
:
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
)
->
torch
.
Tensor
:
return
self
.
word_embeddings
(
input_ids
)
return
self
.
word_embeddings
(
input_ids
)
...
...
vllm/model_executor/models/glm4.py
View file @
6a14c9f3
...
@@ -32,6 +32,7 @@ import torch
...
@@ -32,6 +32,7 @@ import torch
from
torch
import
nn
from
torch
import
nn
from
transformers
import
Glm4Config
from
transformers
import
Glm4Config
import
vllm.envs
as
envs
from
vllm.attention
import
Attention
,
AttentionType
from
vllm.attention
import
Attention
,
AttentionType
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.config
import
CacheConfig
,
VllmConfig
...
@@ -290,8 +291,7 @@ class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
...
@@ -290,8 +291,7 @@ class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
self
.
use_gemm_pad
=
os
.
environ
.
get
(
'GEMM_PAD'
)
==
'1'
self
.
use_gemm_pad
=
os
.
environ
.
get
(
'GEMM_PAD'
)
==
'1'
self
.
use_fa_pad
=
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
self
.
use_fa_pad
=
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
self
.
use_awq_pad
=
os
.
environ
.
get
(
'AWQ_PAD'
)
==
'1'
self
.
use_awq_pad
=
os
.
environ
.
get
(
'AWQ_PAD'
)
==
'1'
self
.
w8a8_strategy
=
int
(
os
.
getenv
(
'W8A8_SUPPORT_METHODS'
,
'1'
))
self
.
w8a8_strategy
=
envs
.
VLLM_W8A8_BACKEND
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
)
->
torch
.
Tensor
:
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
)
->
torch
.
Tensor
:
return
self
.
model
.
get_input_embeddings
(
input_ids
)
return
self
.
model
.
get_input_embeddings
(
input_ids
)
...
...
vllm/model_executor/models/qwen3_moe.py
View file @
6a14c9f3
...
@@ -613,7 +613,7 @@ class Qwen3MoeModel(nn.Module):
...
@@ -613,7 +613,7 @@ class Qwen3MoeModel(nn.Module):
self
.
use_gemm_pad
=
os
.
environ
.
get
(
'GEMM_PAD'
)
==
'1'
self
.
use_gemm_pad
=
os
.
environ
.
get
(
'GEMM_PAD'
)
==
'1'
self
.
use_fa_pad
=
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
self
.
use_fa_pad
=
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
self
.
use_awq_pad
=
os
.
environ
.
get
(
'AWQ_PAD'
)
==
'1'
self
.
use_awq_pad
=
os
.
environ
.
get
(
'AWQ_PAD'
)
==
'1'
self
.
w8a8_strategy
=
int
(
os
.
getenv
(
'W8A8_SUPPORT_METHODS'
,
'1'
))
self
.
w8a8_strategy
=
envs
.
VLLM_W8A8_BACKEND
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
)
->
torch
.
Tensor
:
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
)
->
torch
.
Tensor
:
return
self
.
embed_tokens
(
input_ids
)
return
self
.
embed_tokens
(
input_ids
)
...
...
vllm/model_executor/models/telechat2.py
View file @
6a14c9f3
...
@@ -27,6 +27,7 @@ import re
...
@@ -27,6 +27,7 @@ import re
import
torch
import
torch
import
torch.nn
as
nn
import
torch.nn
as
nn
from
vllm.envs
import
envs
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.models.llama
import
LlamaForCausalLM
,
LlamaModel
from
vllm.model_executor.models.llama
import
LlamaForCausalLM
,
LlamaModel
...
@@ -77,7 +78,7 @@ class TeleChat2Model(LlamaModel):
...
@@ -77,7 +78,7 @@ class TeleChat2Model(LlamaModel):
self
.
use_gemm_pad
=
os
.
environ
.
get
(
'GEMM_PAD'
)
==
'1'
self
.
use_gemm_pad
=
os
.
environ
.
get
(
'GEMM_PAD'
)
==
'1'
self
.
use_fa_pad
=
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
self
.
use_fa_pad
=
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
self
.
use_awq_pad
=
os
.
environ
.
get
(
'AWQ_PAD'
)
==
'1'
self
.
use_awq_pad
=
os
.
environ
.
get
(
'AWQ_PAD'
)
==
'1'
self
.
w8a8_strategy
=
int
(
os
.
getenv
(
'W8A8_SUPPORT_METHODS'
,
'1'
))
self
.
w8a8_strategy
=
envs
.
VLLM_W8A8_BACKEND
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
torch
.
Tensor
]])
->
set
[
str
]:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment