"git@developer.sourcefind.cn:OpenDAS/dgl.git" did not exist on "097ed7e200f4a1219464d0ac92f07c54ae3a60ef"
Unverified Commit ffc0eaab authored by Aryan, committed by GitHub

Bump minimum TorchAO version to 0.7.0 (#10293)

* bump min torchao version to 0.7.0

* update
parent 3c2e2aa8
@@ -93,6 +93,11 @@ class TorchAoHfQuantizer(DiffusersQuantizer):
             raise ImportError(
                 "Loading a TorchAO quantized model requires the torchao library. Please install with `pip install torchao`"
             )
+        torchao_version = version.parse(importlib.metadata.version("torchao"))
+        if torchao_version < version.parse("0.7.0"):
+            raise RuntimeError(
+                f"The minimum required version of `torchao` is 0.7.0, but the current version is {torchao_version}. Please upgrade with `pip install -U torchao`."
+            )
         self.offload = False
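The added check compares the installed torchao release against the new 0.7.0 floor before any quantized weights are handled. For reference, a minimal standalone sketch of the same guard pattern (assumes torchao is installed; the helper name `_require_min_torchao` is illustrative only, not part of diffusers):

import importlib.metadata

from packaging import version


def _require_min_torchao(minimum: str = "0.7.0") -> None:
    # Illustrative helper: read the installed torchao version and fail fast if it is too old.
    installed = version.parse(importlib.metadata.version("torchao"))
    if installed < version.parse(minimum):
        raise RuntimeError(
            f"`torchao>={minimum}` is required, found {installed}. Please upgrade with `pip install -U torchao`."
        )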
@@ -490,11 +490,11 @@ def require_gguf_version_greater_or_equal(gguf_version):
     return decorator


-def require_torchao_version_greater(torchao_version):
+def require_torchao_version_greater_or_equal(torchao_version):
     def decorator(test_case):
         correct_torchao_version = is_torchao_available() and version.parse(
             version.parse(importlib.metadata.version("torchao")).base_version
-        ) > version.parse(torchao_version)
+        ) >= version.parse(torchao_version)
         return unittest.skipUnless(
             correct_torchao_version, f"Test requires torchao with version greater than {torchao_version}."
         )(test_case)
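Renaming the helper to `require_torchao_version_greater_or_equal` and switching the comparison from `>` to `>=` lets tests pin exactly the 0.7.0 minimum rather than requiring a strictly newer release. A rough usage sketch under that assumption (the test class below is hypothetical, not part of the suite):

import unittest

from diffusers.utils.testing_utils import require_torchao_version_greater_or_equal


@require_torchao_version_greater_or_equal("0.7.0")  # skipped unless torchao >= 0.7.0 is installed
class HypotheticalTorchAoSmokeTest(unittest.TestCase):
    def test_torchao_importable(self):
        import torchao  # noqa: F401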
@@ -36,7 +36,7 @@ from diffusers.utils.testing_utils import (
     nightly,
     require_torch,
     require_torch_gpu,
-    require_torchao_version_greater,
+    require_torchao_version_greater_or_equal,
     slow,
     torch_device,
 )
@@ -74,13 +74,13 @@ if is_torch_available():

 if is_torchao_available():
     from torchao.dtypes import AffineQuantizedTensor
-    from torchao.dtypes.affine_quantized_tensor import TensorCoreTiledLayoutType
     from torchao.quantization.linear_activation_quantized_tensor import LinearActivationQuantizedTensor
+    from torchao.utils import get_model_size_in_bytes


 @require_torch
 @require_torch_gpu
-@require_torchao_version_greater("0.6.0")
+@require_torchao_version_greater_or_equal("0.7.0")
 class TorchAoConfigTest(unittest.TestCase):
     def test_to_dict(self):
         """
@@ -125,7 +125,7 @@ class TorchAoConfigTest(unittest.TestCase):

 # Slices for these tests have been obtained on our aws-g6e-xlarge-plus runners
 @require_torch
 @require_torch_gpu
-@require_torchao_version_greater("0.6.0")
+@require_torchao_version_greater_or_equal("0.7.0")
 class TorchAoTest(unittest.TestCase):
     def tearDown(self):
         gc.collect()
@@ -139,11 +139,13 @@ class TorchAoTest(unittest.TestCase):
             quantization_config=quantization_config,
             torch_dtype=torch.bfloat16,
         )
-        text_encoder = CLIPTextModel.from_pretrained(model_id, subfolder="text_encoder")
-        text_encoder_2 = T5EncoderModel.from_pretrained(model_id, subfolder="text_encoder_2")
+        text_encoder = CLIPTextModel.from_pretrained(model_id, subfolder="text_encoder", torch_dtype=torch.bfloat16)
+        text_encoder_2 = T5EncoderModel.from_pretrained(
+            model_id, subfolder="text_encoder_2", torch_dtype=torch.bfloat16
+        )
         tokenizer = CLIPTokenizer.from_pretrained(model_id, subfolder="tokenizer")
         tokenizer_2 = AutoTokenizer.from_pretrained(model_id, subfolder="tokenizer_2")
-        vae = AutoencoderKL.from_pretrained(model_id, subfolder="vae")
+        vae = AutoencoderKL.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.bfloat16)
         scheduler = FlowMatchEulerDiscreteScheduler()

         return {
@@ -212,7 +214,7 @@ class TorchAoTest(unittest.TestCase):
     def _test_quant_type(self, quantization_config: TorchAoConfig, expected_slice: List[float]):
         components = self.get_dummy_components(quantization_config)
         pipe = FluxPipeline(**components)
-        pipe.to(device=torch_device, dtype=torch.bfloat16)
+        pipe.to(device=torch_device)

         inputs = self.get_dummy_inputs(torch_device)
         output = pipe(**inputs)[0]
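With the text encoders and VAE now loaded directly in bfloat16, the pipeline no longer passes a dtype to `pipe.to(...)`, so the quantized transformer weights are never re-cast after loading. A condensed sketch of that flow, using the same tiny checkpoint as the test and assuming a CUDA device is available:

import torch
from diffusers import FluxTransformer2DModel, TorchAoConfig

# Quantize the transformer at load time and fix its dtype there.
transformer = FluxTransformer2DModel.from_pretrained(
    "hf-internal-testing/tiny-flux-pipe",
    subfolder="transformer",
    quantization_config=TorchAoConfig("int8_weight_only"),
    torch_dtype=torch.bfloat16,
)
# Move devices only; no dtype argument, so the quantized tensors stay as they are.
transformer.to("cuda")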
@@ -276,7 +278,6 @@ class TorchAoTest(unittest.TestCase):
         self.assertTrue(isinstance(weight, AffineQuantizedTensor))
         self.assertEqual(weight.quant_min, 0)
         self.assertEqual(weight.quant_max, 15)
-        self.assertTrue(isinstance(weight.layout_type, TensorCoreTiledLayoutType))

     def test_device_map(self):
         """
@@ -341,21 +342,33 @@ class TorchAoTest(unittest.TestCase):
     def test_modules_to_not_convert(self):
         quantization_config = TorchAoConfig("int8_weight_only", modules_to_not_convert=["transformer_blocks.0"])
-        quantized_model = FluxTransformer2DModel.from_pretrained(
+        quantized_model_with_not_convert = FluxTransformer2DModel.from_pretrained(
             "hf-internal-testing/tiny-flux-pipe",
             subfolder="transformer",
             quantization_config=quantization_config,
             torch_dtype=torch.bfloat16,
         )
-        unquantized_layer = quantized_model.transformer_blocks[0].ff.net[2]
+        unquantized_layer = quantized_model_with_not_convert.transformer_blocks[0].ff.net[2]
         self.assertTrue(isinstance(unquantized_layer, torch.nn.Linear))
         self.assertFalse(isinstance(unquantized_layer.weight, AffineQuantizedTensor))
         self.assertEqual(unquantized_layer.weight.dtype, torch.bfloat16)

-        quantized_layer = quantized_model.proj_out
+        quantized_layer = quantized_model_with_not_convert.proj_out
         self.assertTrue(isinstance(quantized_layer.weight, AffineQuantizedTensor))
-        self.assertEqual(quantized_layer.weight.layout_tensor.data.dtype, torch.int8)
+
+        quantization_config = TorchAoConfig("int8_weight_only")
+        quantized_model = FluxTransformer2DModel.from_pretrained(
+            "hf-internal-testing/tiny-flux-pipe",
+            subfolder="transformer",
+            quantization_config=quantization_config,
+            torch_dtype=torch.bfloat16,
+        )
+
+        size_quantized_with_not_convert = get_model_size_in_bytes(quantized_model_with_not_convert)
+        size_quantized = get_model_size_in_bytes(quantized_model)
+
+        self.assertTrue(size_quantized < size_quantized_with_not_convert)

     def test_training(self):
         quantization_config = TorchAoConfig("int8_weight_only")
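The reworked test verifies `modules_to_not_convert` through sizes: skipping `transformer_blocks.0` must leave the model larger than a fully quantized one. A short sketch of how one might also inspect which linear layers were actually skipped, under the same setup as the test (GPU runner, tiny Flux checkpoint):

import torch
from diffusers import FluxTransformer2DModel, TorchAoConfig
from torchao.dtypes import AffineQuantizedTensor

# Keep the first transformer block in bf16; quantize the remaining linears to int8 weight-only.
config = TorchAoConfig("int8_weight_only", modules_to_not_convert=["transformer_blocks.0"])
model = FluxTransformer2DModel.from_pretrained(
    "hf-internal-testing/tiny-flux-pipe",
    subfolder="transformer",
    quantization_config=config,
    torch_dtype=torch.bfloat16,
)

# Report which Linear layers kept plain bf16 weights and which hold AffineQuantizedTensor weights.
for name, module in model.named_modules():
    if isinstance(module, torch.nn.Linear):
        kind = "quantized" if isinstance(module.weight, AffineQuantizedTensor) else "bf16"
        print(f"{name}: {kind}")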
@@ -406,23 +419,6 @@ class TorchAoTest(unittest.TestCase):
         # Note: Seems to require higher tolerance
         self.assertTrue(np.allclose(normal_output, compile_output, atol=1e-2, rtol=1e-3))

-    @staticmethod
-    def _get_memory_footprint(module):
-        quantized_param_memory = 0.0
-        unquantized_param_memory = 0.0
-
-        for param in module.parameters():
-            if param.__class__.__name__ == "AffineQuantizedTensor":
-                data, scale, zero_point = param.layout_tensor.get_plain()
-                quantized_param_memory += data.numel() + data.element_size()
-                quantized_param_memory += scale.numel() + scale.element_size()
-                quantized_param_memory += zero_point.numel() + zero_point.element_size()
-            else:
-                unquantized_param_memory += param.data.numel() * param.data.element_size()
-
-        total_memory = quantized_param_memory + unquantized_param_memory
-        return total_memory, quantized_param_memory, unquantized_param_memory
-
     def test_memory_footprint(self):
         r"""
         A simple test to check if the model conversion has been done correctly by checking on the
@@ -433,20 +429,18 @@ class TorchAoTest(unittest.TestCase):
         transformer_int8wo = self.get_dummy_components(TorchAoConfig("int8wo"))["transformer"]
         transformer_bf16 = self.get_dummy_components(None)["transformer"]

-        total_int4wo, quantized_int4wo, unquantized_int4wo = self._get_memory_footprint(transformer_int4wo)
-        total_int4wo_gs32, quantized_int4wo_gs32, unquantized_int4wo_gs32 = self._get_memory_footprint(
-            transformer_int4wo_gs32
-        )
-        total_int8wo, quantized_int8wo, unquantized_int8wo = self._get_memory_footprint(transformer_int8wo)
-        total_bf16, quantized_bf16, unquantized_bf16 = self._get_memory_footprint(transformer_bf16)
+        total_int4wo = get_model_size_in_bytes(transformer_int4wo)
+        total_int4wo_gs32 = get_model_size_in_bytes(transformer_int4wo_gs32)
+        total_int8wo = get_model_size_in_bytes(transformer_int8wo)
+        total_bf16 = get_model_size_in_bytes(transformer_bf16)

-        self.assertTrue(quantized_bf16 == 0 and total_bf16 == unquantized_bf16)
-        # int4wo_gs32 has smaller group size, so more groups -> more scales and zero points
-        self.assertTrue(total_int8wo < total_bf16 < total_int4wo_gs32)
-        # int4 with default group size quantized very few linear layers compared to a smaller group size of 32
-        self.assertTrue(quantized_int4wo < quantized_int4wo_gs32 and unquantized_int4wo > unquantized_int4wo_gs32)
+        # Latter has smaller group size, so more groups -> more scales and zero points
+        self.assertTrue(total_int4wo < total_int4wo_gs32)
         # int8 quantizes more layers compare to int4 with default group size
-        self.assertTrue(quantized_int8wo < quantized_int4wo)
+        self.assertTrue(total_int8wo < total_int4wo)
+        # int4wo does not quantize too many layers because of default group size, but for the layers it does
+        # there is additional overhead of scales and zero points
+        self.assertTrue(total_bf16 < total_int4wo)

     def test_wrong_config(self):
         with self.assertRaises(ValueError):
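With the hand-rolled `_get_memory_footprint` helper removed, the size comparisons rely on torchao's own `get_model_size_in_bytes`. A self-contained sketch of what that helper reports, using a toy module rather than the Flux transformer (the relative sizes, not the exact numbers, are the point):

import torch
from torchao.quantization import int8_weight_only, quantize_
from torchao.utils import get_model_size_in_bytes

# A bf16 baseline and an int8 weight-only quantized copy of the same toy model.
model_bf16 = torch.nn.Sequential(torch.nn.Linear(256, 256), torch.nn.Linear(256, 256)).to(torch.bfloat16)
model_int8 = torch.nn.Sequential(torch.nn.Linear(256, 256), torch.nn.Linear(256, 256)).to(torch.bfloat16)
quantize_(model_int8, int8_weight_only())

# int8 weight-only storage should come in well under the bf16 baseline (scales add a small overhead).
print(get_model_size_in_bytes(model_bf16))
print(get_model_size_in_bytes(model_int8))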
@@ -456,7 +450,7 @@ class TorchAoTest(unittest.TestCase):

 # This class is not to be run as a test by itself. See the tests that follow this class
 @require_torch
 @require_torch_gpu
-@require_torchao_version_greater("0.6.0")
+@require_torchao_version_greater_or_equal("0.7.0")
 class TorchAoSerializationTest(unittest.TestCase):
     model_name = "hf-internal-testing/tiny-flux-pipe"
     quant_method, quant_method_kwargs = None, None
@@ -565,7 +559,7 @@ class TorchAoSerializationINTA16W8CPUTest(TorchAoSerializationTest):

 # Slices for these tests have been obtained on our aws-g6e-xlarge-plus runners
 @require_torch
 @require_torch_gpu
-@require_torchao_version_greater("0.6.0")
+@require_torchao_version_greater_or_equal("0.7.0")
 @slow
 @nightly
 class SlowTorchAoTests(unittest.TestCase):
@@ -581,11 +575,13 @@ class SlowTorchAoTests(unittest.TestCase):
             quantization_config=quantization_config,
             torch_dtype=torch.bfloat16,
         )
-        text_encoder = CLIPTextModel.from_pretrained(model_id, subfolder="text_encoder")
-        text_encoder_2 = T5EncoderModel.from_pretrained(model_id, subfolder="text_encoder_2")
+        text_encoder = CLIPTextModel.from_pretrained(model_id, subfolder="text_encoder", torch_dtype=torch.bfloat16)
+        text_encoder_2 = T5EncoderModel.from_pretrained(
+            model_id, subfolder="text_encoder_2", torch_dtype=torch.bfloat16
+        )
         tokenizer = CLIPTokenizer.from_pretrained(model_id, subfolder="tokenizer")
         tokenizer_2 = AutoTokenizer.from_pretrained(model_id, subfolder="tokenizer_2")
-        vae = AutoencoderKL.from_pretrained(model_id, subfolder="vae")
+        vae = AutoencoderKL.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.bfloat16)
         scheduler = FlowMatchEulerDiscreteScheduler()

         return {
@@ -617,7 +613,7 @@ class SlowTorchAoTests(unittest.TestCase):
     def _test_quant_type(self, quantization_config, expected_slice):
         components = self.get_dummy_components(quantization_config)
-        pipe = FluxPipeline(**components).to(dtype=torch.bfloat16)
+        pipe = FluxPipeline(**components)
         pipe.enable_model_cpu_offload()

         inputs = self.get_dummy_inputs(torch_device)