[Misc] rename torch_dtype to dtype (#26695)

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>

[Misc] rename torch_dtype to dtype (#26695)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
8f4b313c · wangxiyuan · GitHub · f93e3480 · 8f4b313c · 8f4b313c
Unverified commit 8f4b313c, authored Oct 15, 2025 by wangxiyuan; committed by GitHub on Oct 15, 2025
20 changed files
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -631,7 +631,7 @@ def main(args: argparse.Namespace):
    else:
        ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
        shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
    use_int8_w8a16 = args.dtype == "int8_w8a16"
    block_quant_shape = get_weight_block_size_safety(config)

--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
        topk = config.num_experts_per_tok

    hidden_size = config.hidden_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
    use_int8_w8a16 = args.dtype == "int8_w8a16"
    use_customized_permute = args.use_customized_permute

--- a/docs/features/quantization/auto_round.md
+++ b/docs/features/quantization/auto_round.md
@@ -58,7 +58,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 from auto_round import AutoRound

 model_name = "Qwen/Qwen3-0.6B"
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto")
 tokenizer = AutoTokenizer.from_pretrained(model_name)

 bits, group_size, sym = 4, 128, True

--- a/docs/features/quantization/fp8.md
+++ b/docs/features/quantization/fp8.md
@@ -43,7 +43,7 @@ MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
-    torch_dtype="auto",
+    dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 ```

--- a/docs/features/quantization/int4.md
+++ b/docs/features/quantization/int4.md
@@ -41,7 +41,7 @@ MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
-    torch_dtype="auto",
+    dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 ```

--- a/docs/features/quantization/int8.md
+++ b/docs/features/quantization/int8.md
@@ -46,7 +46,7 @@ MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
-    torch_dtype="auto",
+    dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 ```

--- a/docs/features/quantization/quantized_kvcache.md
+++ b/docs/features/quantization/quantized_kvcache.md
@@ -82,7 +82,7 @@ Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models

    # Select model and load it
    MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
-    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

    # Select calibration dataset

--- a/docs/features/quantization/quark.md
+++ b/docs/features/quantization/quark.md
@@ -50,7 +50,7 @@ to fetch model and tokenizer.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="auto",
-        torch_dtype="auto",
+        dtype="auto",
    )
    model.eval()


--- a/docs/features/quantization/torchao.md
+++ b/docs/features/quantization/torchao.md
@@ -27,7 +27,7 @@ You can quantize your own huggingface model with torchao, e.g. [transformers](ht
    quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
    quantized_model = AutoModelForCausalLM.from_pretrained(
        model_name,
-        torch_dtype="auto",
+        dtype="auto",
        device_map="auto",
        quantization_config=quantization_config
    )

--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -7,7 +7,7 @@ requests >= 2.26.0
 tqdm
 blake3
 py-cpuinfo
-transformers >= 4.55.2
+transformers >= 4.56.0
 tokenizers >= 0.21.1  # Required for fast incremental detokenization.
 protobuf # Required by LlamaTokenizer.
 fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -334,7 +334,7 @@ class HfRunner:
            trust_remote_code=trust_remote_code,
        )
        self.device = self.get_default_device()
-        self.dtype = torch_dtype = _get_and_verify_dtype(
+        self.dtype = dtype = _get_and_verify_dtype(
            self.model_name,
            self.config,
            dtype=dtype,
@@ -342,7 +342,7 @@ class HfRunner:
        )

        model_kwargs = model_kwargs if model_kwargs is not None else {}
-        model_kwargs.setdefault("torch_dtype", torch_dtype)
+        model_kwargs.setdefault("dtype", dtype)

        if is_sentence_transformer:
            # Lazy init required for AMD CI
@@ -388,7 +388,7 @@ class HfRunner:
        if not skip_tokenizer_init:
            self.tokenizer = AutoTokenizer.from_pretrained(
                model_name,
-                torch_dtype=torch_dtype,
+                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )

@@ -398,7 +398,7 @@ class HfRunner:

        self.processor = AutoProcessor.from_pretrained(
            model_name,
-            torch_dtype=torch_dtype,
+            dtype=dtype,
            trust_remote_code=trust_remote_code,
        )
        if skip_tokenizer_init:

--- a/tests/models/multimodal/pooling/test_intern_vit.py
+++ b/tests/models/multimodal/pooling/test_intern_vit.py
@@ -38,7 +38,7 @@ def run_intern_vit_test(
        config.norm_type = "rms_norm"

    hf_model = AutoModel.from_pretrained(
-        model, torch_dtype=torch_dtype, trust_remote_code=True
+        model, dtype=torch_dtype, trust_remote_code=True
    ).to("cuda")
    hf_outputs_per_image = [
        hf_model(pixel_value.to("cuda")).last_hidden_state

--- a/tests/models/multimodal/pooling/test_radio.py
+++ b/tests/models/multimodal/pooling/test_radio.py
@@ -45,7 +45,7 @@ def run_radio_test(
    hf_model = AutoModel.from_pretrained(
        model_id,
        config=config,
-        torch_dtype=torch_dtype,
+        dtype=torch_dtype,
        trust_remote_code=True,
    ).to("cuda")
    hf_model.eval()

--- a/vllm/benchmarks/throughput.py
+++ b/vllm/benchmarks/throughput.py
@@ -251,7 +251,7 @@ def run_hf(
    disable_detokenize: bool = False,
 ) -> float:
    llm = AutoModelForCausalLM.from_pretrained(
-        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code
+        model, dtype=torch.float16, trust_remote_code=trust_remote_code
    )
    if llm.config.model_type == "llama":
        # To enable padding in the HF backend.

--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -1837,18 +1837,18 @@ def _find_dtype(
    *,
    revision: str | None,
 ):
-    # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
-    # because config.torch_dtype can be None.
-    config_dtype = getattr(config, "torch_dtype", None)
+    # NOTE: getattr(config, "dtype", torch.float32) is not correct
+    # because config.dtype can be None.
+    config_dtype = getattr(config, "dtype", None)

    # Fallbacks for multi-modal models if the root config
-    # does not define torch_dtype
+    # does not define dtype
    if config_dtype is None:
-        config_dtype = getattr(config.get_text_config(), "torch_dtype", None)
+        config_dtype = getattr(config.get_text_config(), "dtype", None)
    if config_dtype is None and hasattr(config, "vision_config"):
-        config_dtype = getattr(config.vision_config, "torch_dtype", None)
+        config_dtype = getattr(config.vision_config, "dtype", None)
    if config_dtype is None and hasattr(config, "encoder_config"):
-        config_dtype = getattr(config.encoder_config, "torch_dtype", None)
+        config_dtype = getattr(config.encoder_config, "dtype", None)

    # Try to read the dtype of the weights if they are in safetensors format
    if config_dtype is None:

--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -117,9 +117,8 @@ class LLM:
            execution with tensor parallelism.
        dtype: The data type for the model weights and activations. Currently,
            we support `float32`, `float16`, and `bfloat16`. If `auto`, we use
-            the `torch_dtype` attribute specified in the model config file.
-            However, if the `torch_dtype` in the config is `float32`, we will
-            use `float16` instead.
+            the `dtype` attribute of the Transformers model's config. However,
+            if the `dtype` in the config is `float32`, we will use `float16` instead.
        quantization: The method used to quantize the model weights. Currently,
            we support "awq", "gptq", and "fp8" (experimental).
            If None, we first check the `quantization_config` attribute in the

--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -518,7 +518,7 @@ def init_tensorizer_model(
 ) -> nn.Module:
    assert tensorizer_config.hf_config is not None
    model_args = tensorizer_config.hf_config
-    model_args.torch_dtype = tensorizer_config.dtype
+    model_args.dtype = tensorizer_config.dtype
    assert tensorizer_config.model_class is not None
    # TODO: Do we need to consider old-style model class?
    with meta_tensor_mode(), set_current_vllm_config(vllm_config, check_compile=True):

--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -999,7 +999,7 @@ class ChameleonForConditionalGeneration(
            return []
        assert self.model.vqmodel is not None
        image_tokens = self.model.get_image_tokens(
-            image_input["data"].to(self.config.torch_dtype)
+            image_input["data"].to(self.config.dtype)
        )
        vision_embeddings = self.model.get_input_embeddings(image_tokens)
        return vision_embeddings

--- a/vllm/model_executor/models/ernie45_vl.py
+++ b/vllm/model_executor/models/ernie45_vl.py
@@ -1089,7 +1089,7 @@ class Ernie4_5VLMultiModalProcessor(BaseMultiModalProcessor[Ernie4_5_VLProcessin
        pixel_values = (
            rescale_factor * pixel_values.to(torch.float32) - image_mean_tensor
        ) / image_std_tensor
-        pixel_values = pixel_values.to(hf_config.torch_dtype)
+        pixel_values = pixel_values.to(hf_config.dtype)
        return pixel_values

    def _call_hf_processor(

--- a/vllm/model_executor/models/glm4v.py
+++ b/vllm/model_executor/models/glm4v.py
@@ -615,7 +615,7 @@ class GLM4VForCausalLM(
        return None

    def _process_image_input(self, image_input: GLMVImagePixelInputs) -> torch.Tensor:
-        pixel_values = image_input["data"].to(dtype=self.config.torch_dtype)
+        pixel_values = image_input["data"].to(dtype=self.config.dtype)

        return self.transformer.vision(pixel_values)