Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
8f4b313c
Unverified
Commit
8f4b313c
authored
Oct 15, 2025
by
wangxiyuan
Committed by
GitHub
Oct 15, 2025
Browse files
[Misc] rename torch_dtype to dtype (#26695)
Signed-off-by:
wangxiyuan
<
wangxiyuan1007@gmail.com
>
parent
f93e3480
Changes
30
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
30 additions
and
31 deletions
+30
-31
benchmarks/kernels/benchmark_moe.py
benchmarks/kernels/benchmark_moe.py
+1
-1
benchmarks/kernels/benchmark_moe_permute_unpermute.py
benchmarks/kernels/benchmark_moe_permute_unpermute.py
+1
-1
docs/features/quantization/auto_round.md
docs/features/quantization/auto_round.md
+1
-1
docs/features/quantization/fp8.md
docs/features/quantization/fp8.md
+1
-1
docs/features/quantization/int4.md
docs/features/quantization/int4.md
+1
-1
docs/features/quantization/int8.md
docs/features/quantization/int8.md
+1
-1
docs/features/quantization/quantized_kvcache.md
docs/features/quantization/quantized_kvcache.md
+1
-1
docs/features/quantization/quark.md
docs/features/quantization/quark.md
+1
-1
docs/features/quantization/torchao.md
docs/features/quantization/torchao.md
+1
-1
requirements/common.txt
requirements/common.txt
+1
-1
tests/conftest.py
tests/conftest.py
+4
-4
tests/models/multimodal/pooling/test_intern_vit.py
tests/models/multimodal/pooling/test_intern_vit.py
+1
-1
tests/models/multimodal/pooling/test_radio.py
tests/models/multimodal/pooling/test_radio.py
+1
-1
vllm/benchmarks/throughput.py
vllm/benchmarks/throughput.py
+1
-1
vllm/config/model.py
vllm/config/model.py
+7
-7
vllm/entrypoints/llm.py
vllm/entrypoints/llm.py
+2
-3
vllm/model_executor/model_loader/tensorizer.py
vllm/model_executor/model_loader/tensorizer.py
+1
-1
vllm/model_executor/models/chameleon.py
vllm/model_executor/models/chameleon.py
+1
-1
vllm/model_executor/models/ernie45_vl.py
vllm/model_executor/models/ernie45_vl.py
+1
-1
vllm/model_executor/models/glm4v.py
vllm/model_executor/models/glm4v.py
+1
-1
No files found.
benchmarks/kernels/benchmark_moe.py
View file @
8f4b313c
...
@@ -631,7 +631,7 @@ def main(args: argparse.Namespace):
...
@@ -631,7 +631,7 @@ def main(args: argparse.Namespace):
else
:
else
:
ensure_divisibility
(
intermediate_size
,
args
.
tp_size
,
"intermediate_size"
)
ensure_divisibility
(
intermediate_size
,
args
.
tp_size
,
"intermediate_size"
)
shard_intermediate_size
=
2
*
intermediate_size
//
args
.
tp_size
shard_intermediate_size
=
2
*
intermediate_size
//
args
.
tp_size
dtype
=
torch
.
float16
if
current_platform
.
is_rocm
()
else
config
.
torch_
dtype
dtype
=
torch
.
float16
if
current_platform
.
is_rocm
()
else
config
.
dtype
use_fp8_w8a8
=
args
.
dtype
==
"fp8_w8a8"
use_fp8_w8a8
=
args
.
dtype
==
"fp8_w8a8"
use_int8_w8a16
=
args
.
dtype
==
"int8_w8a16"
use_int8_w8a16
=
args
.
dtype
==
"int8_w8a16"
block_quant_shape
=
get_weight_block_size_safety
(
config
)
block_quant_shape
=
get_weight_block_size_safety
(
config
)
...
...
benchmarks/kernels/benchmark_moe_permute_unpermute.py
View file @
8f4b313c
...
@@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
...
@@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
topk
=
config
.
num_experts_per_tok
topk
=
config
.
num_experts_per_tok
hidden_size
=
config
.
hidden_size
hidden_size
=
config
.
hidden_size
dtype
=
torch
.
float16
if
current_platform
.
is_rocm
()
else
config
.
torch_
dtype
dtype
=
torch
.
float16
if
current_platform
.
is_rocm
()
else
config
.
dtype
use_fp8_w8a8
=
args
.
dtype
==
"fp8_w8a8"
use_fp8_w8a8
=
args
.
dtype
==
"fp8_w8a8"
use_int8_w8a16
=
args
.
dtype
==
"int8_w8a16"
use_int8_w8a16
=
args
.
dtype
==
"int8_w8a16"
use_customized_permute
=
args
.
use_customized_permute
use_customized_permute
=
args
.
use_customized_permute
...
...
docs/features/quantization/auto_round.md
View file @
8f4b313c
...
@@ -58,7 +58,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
...
@@ -58,7 +58,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
from
auto_round
import
AutoRound
from
auto_round
import
AutoRound
model_name
=
"Qwen/Qwen3-0.6B"
model_name
=
"Qwen/Qwen3-0.6B"
model
=
AutoModelForCausalLM
.
from_pretrained
(
model_name
,
torch_
dtype
=
"auto"
)
model
=
AutoModelForCausalLM
.
from_pretrained
(
model_name
,
dtype
=
"auto"
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
)
bits
,
group_size
,
sym
=
4
,
128
,
True
bits
,
group_size
,
sym
=
4
,
128
,
True
...
...
docs/features/quantization/fp8.md
View file @
8f4b313c
...
@@ -43,7 +43,7 @@ MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
...
@@ -43,7 +43,7 @@ MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model
=
AutoModelForCausalLM
.
from_pretrained
(
model
=
AutoModelForCausalLM
.
from_pretrained
(
MODEL_ID
,
MODEL_ID
,
device_map
=
"auto"
,
device_map
=
"auto"
,
torch_
dtype
=
"auto"
,
dtype
=
"auto"
,
)
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
MODEL_ID
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
MODEL_ID
)
```
```
...
...
docs/features/quantization/int4.md
View file @
8f4b313c
...
@@ -41,7 +41,7 @@ MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
...
@@ -41,7 +41,7 @@ MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model
=
AutoModelForCausalLM
.
from_pretrained
(
model
=
AutoModelForCausalLM
.
from_pretrained
(
MODEL_ID
,
MODEL_ID
,
device_map
=
"auto"
,
device_map
=
"auto"
,
torch_
dtype
=
"auto"
,
dtype
=
"auto"
,
)
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
MODEL_ID
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
MODEL_ID
)
```
```
...
...
docs/features/quantization/int8.md
View file @
8f4b313c
...
@@ -46,7 +46,7 @@ MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
...
@@ -46,7 +46,7 @@ MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model
=
AutoModelForCausalLM
.
from_pretrained
(
model
=
AutoModelForCausalLM
.
from_pretrained
(
MODEL_ID
,
MODEL_ID
,
device_map
=
"auto"
,
device_map
=
"auto"
,
torch_
dtype
=
"auto"
,
dtype
=
"auto"
,
)
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
MODEL_ID
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
MODEL_ID
)
```
```
...
...
docs/features/quantization/quantized_kvcache.md
View file @
8f4b313c
...
@@ -82,7 +82,7 @@ Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models
...
@@ -82,7 +82,7 @@ Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models
# Select model and load it
# Select model and load it
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Select calibration dataset
# Select calibration dataset
...
...
docs/features/quantization/quark.md
View file @
8f4b313c
...
@@ -50,7 +50,7 @@ to fetch model and tokenizer.
...
@@ -50,7 +50,7 @@ to fetch model and tokenizer.
model = AutoModelForCausalLM.from_pretrained(
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
MODEL_ID,
device_map="auto",
device_map="auto",
torch_dtype="auto",
dtype="auto",
)
)
model.eval()
model.eval()
...
...
docs/features/quantization/torchao.md
View file @
8f4b313c
...
@@ -27,7 +27,7 @@ You can quantize your own huggingface model with torchao, e.g. [transformers](ht
...
@@ -27,7 +27,7 @@ You can quantize your own huggingface model with torchao, e.g. [transformers](ht
quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
quantized_model = AutoModelForCausalLM.from_pretrained(
quantized_model = AutoModelForCausalLM.from_pretrained(
model_name,
model_name,
torch_dtype="auto",
dtype="auto",
device_map="auto",
device_map="auto",
quantization_config=quantization_config
quantization_config=quantization_config
)
)
...
...
requirements/common.txt
View file @
8f4b313c
...
@@ -7,7 +7,7 @@ requests >= 2.26.0
...
@@ -7,7 +7,7 @@ requests >= 2.26.0
tqdm
tqdm
blake3
blake3
py-cpuinfo
py-cpuinfo
transformers >= 4.55.2
transformers >= 4.56.0
tokenizers >= 0.21.1 # Required for fast incremental detokenization.
tokenizers >= 0.21.1 # Required for fast incremental detokenization.
protobuf # Required by LlamaTokenizer.
protobuf # Required by LlamaTokenizer.
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
...
...
tests/conftest.py
View file @
8f4b313c
...
@@ -334,7 +334,7 @@ class HfRunner:
...
@@ -334,7 +334,7 @@ class HfRunner:
trust_remote_code
=
trust_remote_code
,
trust_remote_code
=
trust_remote_code
,
)
)
self
.
device
=
self
.
get_default_device
()
self
.
device
=
self
.
get_default_device
()
self
.
dtype
=
torch_
dtype
=
_get_and_verify_dtype
(
self
.
dtype
=
dtype
=
_get_and_verify_dtype
(
self
.
model_name
,
self
.
model_name
,
self
.
config
,
self
.
config
,
dtype
=
dtype
,
dtype
=
dtype
,
...
@@ -342,7 +342,7 @@ class HfRunner:
...
@@ -342,7 +342,7 @@ class HfRunner:
)
)
model_kwargs
=
model_kwargs
if
model_kwargs
is
not
None
else
{}
model_kwargs
=
model_kwargs
if
model_kwargs
is
not
None
else
{}
model_kwargs
.
setdefault
(
"
torch_
dtype"
,
torch_
dtype
)
model_kwargs
.
setdefault
(
"dtype"
,
dtype
)
if
is_sentence_transformer
:
if
is_sentence_transformer
:
# Lazy init required for AMD CI
# Lazy init required for AMD CI
...
@@ -388,7 +388,7 @@ class HfRunner:
...
@@ -388,7 +388,7 @@ class HfRunner:
if
not
skip_tokenizer_init
:
if
not
skip_tokenizer_init
:
self
.
tokenizer
=
AutoTokenizer
.
from_pretrained
(
self
.
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
model_name
,
torch_dtype
=
torch_
dtype
,
dtype
=
dtype
,
trust_remote_code
=
trust_remote_code
,
trust_remote_code
=
trust_remote_code
,
)
)
...
@@ -398,7 +398,7 @@ class HfRunner:
...
@@ -398,7 +398,7 @@ class HfRunner:
self
.
processor
=
AutoProcessor
.
from_pretrained
(
self
.
processor
=
AutoProcessor
.
from_pretrained
(
model_name
,
model_name
,
torch_dtype
=
torch_
dtype
,
dtype
=
dtype
,
trust_remote_code
=
trust_remote_code
,
trust_remote_code
=
trust_remote_code
,
)
)
if
skip_tokenizer_init
:
if
skip_tokenizer_init
:
...
...
tests/models/multimodal/pooling/test_intern_vit.py
View file @
8f4b313c
...
@@ -38,7 +38,7 @@ def run_intern_vit_test(
...
@@ -38,7 +38,7 @@ def run_intern_vit_test(
config
.
norm_type
=
"rms_norm"
config
.
norm_type
=
"rms_norm"
hf_model
=
AutoModel
.
from_pretrained
(
hf_model
=
AutoModel
.
from_pretrained
(
model
,
torch_
dtype
=
torch_dtype
,
trust_remote_code
=
True
model
,
dtype
=
torch_dtype
,
trust_remote_code
=
True
).
to
(
"cuda"
)
).
to
(
"cuda"
)
hf_outputs_per_image
=
[
hf_outputs_per_image
=
[
hf_model
(
pixel_value
.
to
(
"cuda"
)).
last_hidden_state
hf_model
(
pixel_value
.
to
(
"cuda"
)).
last_hidden_state
...
...
tests/models/multimodal/pooling/test_radio.py
View file @
8f4b313c
...
@@ -45,7 +45,7 @@ def run_radio_test(
...
@@ -45,7 +45,7 @@ def run_radio_test(
hf_model
=
AutoModel
.
from_pretrained
(
hf_model
=
AutoModel
.
from_pretrained
(
model_id
,
model_id
,
config
=
config
,
config
=
config
,
torch_
dtype
=
torch_dtype
,
dtype
=
torch_dtype
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
).
to
(
"cuda"
)
).
to
(
"cuda"
)
hf_model
.
eval
()
hf_model
.
eval
()
...
...
vllm/benchmarks/throughput.py
View file @
8f4b313c
...
@@ -251,7 +251,7 @@ def run_hf(
...
@@ -251,7 +251,7 @@ def run_hf(
disable_detokenize
:
bool
=
False
,
disable_detokenize
:
bool
=
False
,
)
->
float
:
)
->
float
:
llm
=
AutoModelForCausalLM
.
from_pretrained
(
llm
=
AutoModelForCausalLM
.
from_pretrained
(
model
,
torch_
dtype
=
torch
.
float16
,
trust_remote_code
=
trust_remote_code
model
,
dtype
=
torch
.
float16
,
trust_remote_code
=
trust_remote_code
)
)
if
llm
.
config
.
model_type
==
"llama"
:
if
llm
.
config
.
model_type
==
"llama"
:
# To enable padding in the HF backend.
# To enable padding in the HF backend.
...
...
vllm/config/model.py
View file @
8f4b313c
...
@@ -1837,18 +1837,18 @@ def _find_dtype(
...
@@ -1837,18 +1837,18 @@ def _find_dtype(
*
,
*
,
revision
:
str
|
None
,
revision
:
str
|
None
,
):
):
# NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
# NOTE: getattr(config, "dtype", torch.float32) is not correct
# because config.torch_dtype can be None.
# because config.dtype can be None.
config_dtype
=
getattr
(
config
,
"
torch_
dtype"
,
None
)
config_dtype
=
getattr
(
config
,
"dtype"
,
None
)
# Fallbacks for multi-modal models if the root config
# Fallbacks for multi-modal models if the root config
# does not define torch_dtype
# does not define dtype
if
config_dtype
is
None
:
if
config_dtype
is
None
:
config_dtype
=
getattr
(
config
.
get_text_config
(),
"
torch_
dtype"
,
None
)
config_dtype
=
getattr
(
config
.
get_text_config
(),
"dtype"
,
None
)
if
config_dtype
is
None
and
hasattr
(
config
,
"vision_config"
):
if
config_dtype
is
None
and
hasattr
(
config
,
"vision_config"
):
config_dtype
=
getattr
(
config
.
vision_config
,
"
torch_
dtype"
,
None
)
config_dtype
=
getattr
(
config
.
vision_config
,
"dtype"
,
None
)
if
config_dtype
is
None
and
hasattr
(
config
,
"encoder_config"
):
if
config_dtype
is
None
and
hasattr
(
config
,
"encoder_config"
):
config_dtype
=
getattr
(
config
.
encoder_config
,
"
torch_
dtype"
,
None
)
config_dtype
=
getattr
(
config
.
encoder_config
,
"dtype"
,
None
)
# Try to read the dtype of the weights if they are in safetensors format
# Try to read the dtype of the weights if they are in safetensors format
if
config_dtype
is
None
:
if
config_dtype
is
None
:
...
...
vllm/entrypoints/llm.py
View file @
8f4b313c
...
@@ -117,9 +117,8 @@ class LLM:
...
@@ -117,9 +117,8 @@ class LLM:
execution with tensor parallelism.
execution with tensor parallelism.
dtype: The data type for the model weights and activations. Currently,
dtype: The data type for the model weights and activations. Currently,
we support `float32`, `float16`, and `bfloat16`. If `auto`, we use
we support `float32`, `float16`, and `bfloat16`. If `auto`, we use
the `torch_dtype` attribute specified in the model config file.
the `dtype` attribute of the Transformers model's config. However,
However, if the `torch_dtype` in the config is `float32`, we will
if the `dtype` in the config is `float32`, we will use `float16` instead.
use `float16` instead.
quantization: The method used to quantize the model weights. Currently,
quantization: The method used to quantize the model weights. Currently,
we support "awq", "gptq", and "fp8" (experimental).
we support "awq", "gptq", and "fp8" (experimental).
If None, we first check the `quantization_config` attribute in the
If None, we first check the `quantization_config` attribute in the
...
...
vllm/model_executor/model_loader/tensorizer.py
View file @
8f4b313c
...
@@ -518,7 +518,7 @@ def init_tensorizer_model(
...
@@ -518,7 +518,7 @@ def init_tensorizer_model(
)
->
nn
.
Module
:
)
->
nn
.
Module
:
assert
tensorizer_config
.
hf_config
is
not
None
assert
tensorizer_config
.
hf_config
is
not
None
model_args
=
tensorizer_config
.
hf_config
model_args
=
tensorizer_config
.
hf_config
model_args
.
torch_
dtype
=
tensorizer_config
.
dtype
model_args
.
dtype
=
tensorizer_config
.
dtype
assert
tensorizer_config
.
model_class
is
not
None
assert
tensorizer_config
.
model_class
is
not
None
# TODO: Do we need to consider old-style model class?
# TODO: Do we need to consider old-style model class?
with
meta_tensor_mode
(),
set_current_vllm_config
(
vllm_config
,
check_compile
=
True
):
with
meta_tensor_mode
(),
set_current_vllm_config
(
vllm_config
,
check_compile
=
True
):
...
...
vllm/model_executor/models/chameleon.py
View file @
8f4b313c
...
@@ -999,7 +999,7 @@ class ChameleonForConditionalGeneration(
...
@@ -999,7 +999,7 @@ class ChameleonForConditionalGeneration(
return
[]
return
[]
assert
self
.
model
.
vqmodel
is
not
None
assert
self
.
model
.
vqmodel
is
not
None
image_tokens
=
self
.
model
.
get_image_tokens
(
image_tokens
=
self
.
model
.
get_image_tokens
(
image_input
[
"data"
].
to
(
self
.
config
.
torch_
dtype
)
image_input
[
"data"
].
to
(
self
.
config
.
dtype
)
)
)
vision_embeddings
=
self
.
model
.
get_input_embeddings
(
image_tokens
)
vision_embeddings
=
self
.
model
.
get_input_embeddings
(
image_tokens
)
return
vision_embeddings
return
vision_embeddings
...
...
vllm/model_executor/models/ernie45_vl.py
View file @
8f4b313c
...
@@ -1089,7 +1089,7 @@ class Ernie4_5VLMultiModalProcessor(BaseMultiModalProcessor[Ernie4_5_VLProcessin
...
@@ -1089,7 +1089,7 @@ class Ernie4_5VLMultiModalProcessor(BaseMultiModalProcessor[Ernie4_5_VLProcessin
pixel_values
=
(
pixel_values
=
(
rescale_factor
*
pixel_values
.
to
(
torch
.
float32
)
-
image_mean_tensor
rescale_factor
*
pixel_values
.
to
(
torch
.
float32
)
-
image_mean_tensor
)
/
image_std_tensor
)
/
image_std_tensor
pixel_values
=
pixel_values
.
to
(
hf_config
.
torch_
dtype
)
pixel_values
=
pixel_values
.
to
(
hf_config
.
dtype
)
return
pixel_values
return
pixel_values
def
_call_hf_processor
(
def
_call_hf_processor
(
...
...
vllm/model_executor/models/glm4v.py
View file @
8f4b313c
...
@@ -615,7 +615,7 @@ class GLM4VForCausalLM(
...
@@ -615,7 +615,7 @@ class GLM4VForCausalLM(
return
None
return
None
def
_process_image_input
(
self
,
image_input
:
GLMVImagePixelInputs
)
->
torch
.
Tensor
:
def
_process_image_input
(
self
,
image_input
:
GLMVImagePixelInputs
)
->
torch
.
Tensor
:
pixel_values
=
image_input
[
"data"
].
to
(
dtype
=
self
.
config
.
torch_
dtype
)
pixel_values
=
image_input
[
"data"
].
to
(
dtype
=
self
.
config
.
dtype
)
return
self
.
transformer
.
vision
(
pixel_values
)
return
self
.
transformer
.
vision
(
pixel_values
)
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment