Unverified Commit 8f4b313c authored by wangxiyuan's avatar wangxiyuan Committed by GitHub
Browse files

[Misc] rename torch_dtype to dtype (#26695)


Signed-off-by: default avatarwangxiyuan <wangxiyuan1007@gmail.com>
parent f93e3480
......@@ -631,7 +631,7 @@ def main(args: argparse.Namespace):
else:
ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
shard_intermediate_size = 2 * intermediate_size // args.tp_size
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16"
block_quant_shape = get_weight_block_size_safety(config)
......
......@@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
topk = config.num_experts_per_tok
hidden_size = config.hidden_size
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16"
use_customized_permute = args.use_customized_permute
......
......@@ -58,7 +58,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound
model_name = "Qwen/Qwen3-0.6B"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
bits, group_size, sym = 4, 128, True
......
......@@ -43,7 +43,7 @@ MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
device_map="auto",
torch_dtype="auto",
dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
```
......
......@@ -41,7 +41,7 @@ MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
device_map="auto",
torch_dtype="auto",
dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
```
......
......@@ -46,7 +46,7 @@ MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
device_map="auto",
torch_dtype="auto",
dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
```
......
......@@ -82,7 +82,7 @@ Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models
# Select model and load it
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Select calibration dataset
......
......@@ -50,7 +50,7 @@ to fetch model and tokenizer.
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
device_map="auto",
torch_dtype="auto",
dtype="auto",
)
model.eval()
......
......@@ -27,7 +27,7 @@ You can quantize your own huggingface model with torchao, e.g. [transformers](ht
quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
quantized_model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype="auto",
dtype="auto",
device_map="auto",
quantization_config=quantization_config
)
......
......@@ -7,7 +7,7 @@ requests >= 2.26.0
tqdm
blake3
py-cpuinfo
transformers >= 4.55.2
transformers >= 4.56.0
tokenizers >= 0.21.1 # Required for fast incremental detokenization.
protobuf # Required by LlamaTokenizer.
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
......
......@@ -334,7 +334,7 @@ class HfRunner:
trust_remote_code=trust_remote_code,
)
self.device = self.get_default_device()
self.dtype = torch_dtype = _get_and_verify_dtype(
self.dtype = dtype = _get_and_verify_dtype(
self.model_name,
self.config,
dtype=dtype,
......@@ -342,7 +342,7 @@ class HfRunner:
)
model_kwargs = model_kwargs if model_kwargs is not None else {}
model_kwargs.setdefault("torch_dtype", torch_dtype)
model_kwargs.setdefault("dtype", dtype)
if is_sentence_transformer:
# Lazy init required for AMD CI
......@@ -388,7 +388,7 @@ class HfRunner:
if not skip_tokenizer_init:
self.tokenizer = AutoTokenizer.from_pretrained(
model_name,
torch_dtype=torch_dtype,
dtype=dtype,
trust_remote_code=trust_remote_code,
)
......@@ -398,7 +398,7 @@ class HfRunner:
self.processor = AutoProcessor.from_pretrained(
model_name,
torch_dtype=torch_dtype,
dtype=dtype,
trust_remote_code=trust_remote_code,
)
if skip_tokenizer_init:
......
......@@ -38,7 +38,7 @@ def run_intern_vit_test(
config.norm_type = "rms_norm"
hf_model = AutoModel.from_pretrained(
model, torch_dtype=torch_dtype, trust_remote_code=True
model, dtype=torch_dtype, trust_remote_code=True
).to("cuda")
hf_outputs_per_image = [
hf_model(pixel_value.to("cuda")).last_hidden_state
......
......@@ -45,7 +45,7 @@ def run_radio_test(
hf_model = AutoModel.from_pretrained(
model_id,
config=config,
torch_dtype=torch_dtype,
dtype=torch_dtype,
trust_remote_code=True,
).to("cuda")
hf_model.eval()
......
......@@ -251,7 +251,7 @@ def run_hf(
disable_detokenize: bool = False,
) -> float:
llm = AutoModelForCausalLM.from_pretrained(
model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code
model, dtype=torch.float16, trust_remote_code=trust_remote_code
)
if llm.config.model_type == "llama":
# To enable padding in the HF backend.
......
......@@ -1837,18 +1837,18 @@ def _find_dtype(
*,
revision: str | None,
):
# NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
# because config.torch_dtype can be None.
config_dtype = getattr(config, "torch_dtype", None)
# NOTE: getattr(config, "dtype", torch.float32) is not correct
# because config.dtype can be None.
config_dtype = getattr(config, "dtype", None)
# Fallbacks for multi-modal models if the root config
# does not define torch_dtype
# does not define dtype
if config_dtype is None:
config_dtype = getattr(config.get_text_config(), "torch_dtype", None)
config_dtype = getattr(config.get_text_config(), "dtype", None)
if config_dtype is None and hasattr(config, "vision_config"):
config_dtype = getattr(config.vision_config, "torch_dtype", None)
config_dtype = getattr(config.vision_config, "dtype", None)
if config_dtype is None and hasattr(config, "encoder_config"):
config_dtype = getattr(config.encoder_config, "torch_dtype", None)
config_dtype = getattr(config.encoder_config, "dtype", None)
# Try to read the dtype of the weights if they are in safetensors format
if config_dtype is None:
......
......@@ -117,9 +117,8 @@ class LLM:
execution with tensor parallelism.
dtype: The data type for the model weights and activations. Currently,
we support `float32`, `float16`, and `bfloat16`. If `auto`, we use
the `torch_dtype` attribute specified in the model config file.
However, if the `torch_dtype` in the config is `float32`, we will
use `float16` instead.
the `dtype` attribute of the Transformers model's config. However,
if the `dtype` in the config is `float32`, we will use `float16` instead.
quantization: The method used to quantize the model weights. Currently,
we support "awq", "gptq", and "fp8" (experimental).
If None, we first check the `quantization_config` attribute in the
......
......@@ -518,7 +518,7 @@ def init_tensorizer_model(
) -> nn.Module:
assert tensorizer_config.hf_config is not None
model_args = tensorizer_config.hf_config
model_args.torch_dtype = tensorizer_config.dtype
model_args.dtype = tensorizer_config.dtype
assert tensorizer_config.model_class is not None
# TODO: Do we need to consider old-style model class?
with meta_tensor_mode(), set_current_vllm_config(vllm_config, check_compile=True):
......
......@@ -999,7 +999,7 @@ class ChameleonForConditionalGeneration(
return []
assert self.model.vqmodel is not None
image_tokens = self.model.get_image_tokens(
image_input["data"].to(self.config.torch_dtype)
image_input["data"].to(self.config.dtype)
)
vision_embeddings = self.model.get_input_embeddings(image_tokens)
return vision_embeddings
......
......@@ -1089,7 +1089,7 @@ class Ernie4_5VLMultiModalProcessor(BaseMultiModalProcessor[Ernie4_5_VLProcessin
pixel_values = (
rescale_factor * pixel_values.to(torch.float32) - image_mean_tensor
) / image_std_tensor
pixel_values = pixel_values.to(hf_config.torch_dtype)
pixel_values = pixel_values.to(hf_config.dtype)
return pixel_values
def _call_hf_processor(
......
......@@ -615,7 +615,7 @@ class GLM4VForCausalLM(
return None
def _process_image_input(self, image_input: GLMVImagePixelInputs) -> torch.Tensor:
pixel_values = image_input["data"].to(dtype=self.config.torch_dtype)
pixel_values = image_input["data"].to(dtype=self.config.dtype)
return self.transformer.vision(pixel_values)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment