renzhc / diffusers_dcu · Commits

Commit ffc0eaab (unverified)
Bump minimum TorchAO version to 0.7.0 (#10293)

* bump min torchao version to 0.7.0
* update

Authored Dec 23, 2024 by Aryan; committed via GitHub on Dec 23, 2024.
Parent: 3c2e2aa8

Showing 3 changed files with 52 additions and 51 deletions.
src/diffusers/quantizers/torchao/torchao_quantizer.py  (+5, -0)
src/diffusers/utils/testing_utils.py                   (+2, -2)
tests/quantization/torchao/test_torchao.py             (+45, -49)
src/diffusers/quantizers/torchao/torchao_quantizer.py

```diff
@@ -93,6 +93,11 @@ class TorchAoHfQuantizer(DiffusersQuantizer):
             raise ImportError(
                 "Loading a TorchAO quantized model requires the torchao library. Please install with `pip install torchao`"
             )
 
+        torchao_version = version.parse(importlib.metadata.version("torchao"))
+        if torchao_version < version.parse("0.7.0"):
+            raise RuntimeError(
+                f"The minimum required version of `torchao` is 0.7.0, but the current version is {torchao_version}. Please upgrade with `pip install -U torchao`."
+            )
 
         self.offload = False
```
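For context, a minimal standalone sketch of the same gating pattern (the `check_torchao_version` helper name is illustrative, not part of the commit):

```python
import importlib.metadata

from packaging import version


def check_torchao_version(minimum: str = "0.7.0") -> None:
    # Mirrors the quantizer's validate_environment logic: compare the
    # installed torchao distribution against the required minimum.
    installed = version.parse(importlib.metadata.version("torchao"))
    if installed < version.parse(minimum):
        raise RuntimeError(
            f"The minimum required version of `torchao` is {minimum}, but the "
            f"current version is {installed}. Please upgrade with `pip install -U torchao`."
        )
```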
src/diffusers/utils/testing_utils.py

```diff
@@ -490,11 +490,11 @@ def require_gguf_version_greater_or_equal(gguf_version):
     return decorator
 
 
-def require_torchao_version_greater(torchao_version):
+def require_torchao_version_greater_or_equal(torchao_version):
    def decorator(test_case):
        correct_torchao_version = is_torchao_available() and version.parse(
            version.parse(importlib.metadata.version("torchao")).base_version
-        ) > version.parse(torchao_version)
+        ) >= version.parse(torchao_version)
        return unittest.skipUnless(
            correct_torchao_version, f"Test requires torchao with version greater than {torchao_version}."
        )(test_case)
```
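A usage sketch for the renamed decorator (the test class below is hypothetical, not from this commit):

```python
import unittest

from diffusers.utils.testing_utils import require_torchao_version_greater_or_equal


@require_torchao_version_greater_or_equal("0.7.0")
class TorchAoVersionGateExample(unittest.TestCase):
    # Skipped automatically when torchao is unavailable or older than 0.7.0;
    # the `>=` comparison now lets exactly 0.7.0 pass, which `>` did not.
    def test_runs(self):
        self.assertTrue(True)
```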
tests/quantization/torchao/test_torchao.py

```diff
@@ -36,7 +36,7 @@ from diffusers.utils.testing_utils import (
     nightly,
     require_torch,
     require_torch_gpu,
-    require_torchao_version_greater,
+    require_torchao_version_greater_or_equal,
     slow,
     torch_device,
 )
```
```diff
@@ -74,13 +74,13 @@ if is_torch_available():
 if is_torchao_available():
     from torchao.dtypes import AffineQuantizedTensor
-    from torchao.dtypes.affine_quantized_tensor import TensorCoreTiledLayoutType
     from torchao.quantization.linear_activation_quantized_tensor import LinearActivationQuantizedTensor
+    from torchao.utils import get_model_size_in_bytes
 
 
 @require_torch
 @require_torch_gpu
-@require_torchao_version_greater("0.6.0")
+@require_torchao_version_greater_or_equal("0.7.0")
 class TorchAoConfigTest(unittest.TestCase):
     def test_to_dict(self):
         """
```
```diff
@@ -125,7 +125,7 @@ class TorchAoConfigTest(unittest.TestCase):
 # Slices for these tests have been obtained on our aws-g6e-xlarge-plus runners
 @require_torch
 @require_torch_gpu
-@require_torchao_version_greater("0.6.0")
+@require_torchao_version_greater_or_equal("0.7.0")
 class TorchAoTest(unittest.TestCase):
     def tearDown(self):
         gc.collect()
```
```diff
@@ -139,11 +139,13 @@ class TorchAoTest(unittest.TestCase):
             quantization_config=quantization_config,
             torch_dtype=torch.bfloat16,
         )
-        text_encoder = CLIPTextModel.from_pretrained(model_id, subfolder="text_encoder")
-        text_encoder_2 = T5EncoderModel.from_pretrained(model_id, subfolder="text_encoder_2")
+        text_encoder = CLIPTextModel.from_pretrained(model_id, subfolder="text_encoder", torch_dtype=torch.bfloat16)
+        text_encoder_2 = T5EncoderModel.from_pretrained(
+            model_id, subfolder="text_encoder_2", torch_dtype=torch.bfloat16
+        )
         tokenizer = CLIPTokenizer.from_pretrained(model_id, subfolder="tokenizer")
         tokenizer_2 = AutoTokenizer.from_pretrained(model_id, subfolder="tokenizer_2")
-        vae = AutoencoderKL.from_pretrained(model_id, subfolder="vae")
+        vae = AutoencoderKL.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.bfloat16)
         scheduler = FlowMatchEulerDiscreteScheduler()
         return {
```
```diff
@@ -212,7 +214,7 @@ class TorchAoTest(unittest.TestCase):
     def _test_quant_type(self, quantization_config: TorchAoConfig, expected_slice: List[float]):
         components = self.get_dummy_components(quantization_config)
         pipe = FluxPipeline(**components)
-        pipe.to(device=torch_device, dtype=torch.bfloat16)
+        pipe.to(device=torch_device)
 
         inputs = self.get_dummy_inputs(torch_device)
         output = pipe(**inputs)[0]
```
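Both hunks above move dtype handling to load time: the encoders and VAE are loaded in bfloat16 via `from_pretrained`, and the later `pipe.to(...)` no longer passes a dtype, presumably so the already-quantized transformer is never re-cast. A minimal sketch of the load-time dtype pattern (checkpoint and config names taken from the test; this is an illustration, not part of the diff):

```python
import torch
from diffusers import FluxTransformer2DModel, TorchAoConfig

# dtype is fixed at load time; no post-hoc .to(dtype=...) cast is needed.
transformer = FluxTransformer2DModel.from_pretrained(
    "hf-internal-testing/tiny-flux-pipe",
    subfolder="transformer",
    quantization_config=TorchAoConfig("int8_weight_only"),
    torch_dtype=torch.bfloat16,
)
```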
```diff
@@ -276,7 +278,6 @@ class TorchAoTest(unittest.TestCase):
         self.assertTrue(isinstance(weight, AffineQuantizedTensor))
         self.assertEqual(weight.quant_min, 0)
         self.assertEqual(weight.quant_max, 15)
-        self.assertTrue(isinstance(weight.layout_type, TensorCoreTiledLayoutType))
 
     def test_device_map(self):
         """
```
```diff
@@ -341,21 +342,33 @@ class TorchAoTest(unittest.TestCase):
     def test_modules_to_not_convert(self):
         quantization_config = TorchAoConfig("int8_weight_only", modules_to_not_convert=["transformer_blocks.0"])
-        quantized_model = FluxTransformer2DModel.from_pretrained(
+        quantized_model_with_not_convert = FluxTransformer2DModel.from_pretrained(
             "hf-internal-testing/tiny-flux-pipe",
             subfolder="transformer",
             quantization_config=quantization_config,
             torch_dtype=torch.bfloat16,
         )
 
-        unquantized_layer = quantized_model.transformer_blocks[0].ff.net[2]
+        unquantized_layer = quantized_model_with_not_convert.transformer_blocks[0].ff.net[2]
         self.assertTrue(isinstance(unquantized_layer, torch.nn.Linear))
         self.assertFalse(isinstance(unquantized_layer.weight, AffineQuantizedTensor))
         self.assertEqual(unquantized_layer.weight.dtype, torch.bfloat16)
 
-        quantized_layer = quantized_model.proj_out
+        quantized_layer = quantized_model_with_not_convert.proj_out
         self.assertTrue(isinstance(quantized_layer.weight, AffineQuantizedTensor))
-        self.assertEqual(quantized_layer.weight.layout_tensor.data.dtype, torch.int8)
+
+        quantization_config = TorchAoConfig("int8_weight_only")
+        quantized_model = FluxTransformer2DModel.from_pretrained(
+            "hf-internal-testing/tiny-flux-pipe",
+            subfolder="transformer",
+            quantization_config=quantization_config,
+            torch_dtype=torch.bfloat16,
+        )
+
+        size_quantized_with_not_convert = get_model_size_in_bytes(quantized_model_with_not_convert)
+        size_quantized = get_model_size_in_bytes(quantized_model)
+
+        self.assertTrue(size_quantized < size_quantized_with_not_convert)
 
     def test_training(self):
         quantization_config = TorchAoConfig("int8_weight_only")
```
```diff
@@ -406,23 +419,6 @@ class TorchAoTest(unittest.TestCase):
         # Note: Seems to require higher tolerance
         self.assertTrue(np.allclose(normal_output, compile_output, atol=1e-2, rtol=1e-3))
 
-    @staticmethod
-    def _get_memory_footprint(module):
-        quantized_param_memory = 0.0
-        unquantized_param_memory = 0.0
-
-        for param in module.parameters():
-            if param.__class__.__name__ == "AffineQuantizedTensor":
-                data, scale, zero_point = param.layout_tensor.get_plain()
-                quantized_param_memory += data.numel() + data.element_size()
-                quantized_param_memory += scale.numel() + scale.element_size()
-                quantized_param_memory += zero_point.numel() + zero_point.element_size()
-            else:
-                unquantized_param_memory += param.data.numel() * param.data.element_size()
-
-        total_memory = quantized_param_memory + unquantized_param_memory
-        return total_memory, quantized_param_memory, unquantized_param_memory
-
     def test_memory_footprint(self):
         r"""
         A simple test to check if the model conversion has been done correctly by checking on the
```
```diff
@@ -433,20 +429,18 @@ class TorchAoTest(unittest.TestCase):
         transformer_int8wo = self.get_dummy_components(TorchAoConfig("int8wo"))["transformer"]
         transformer_bf16 = self.get_dummy_components(None)["transformer"]
 
-        total_int4wo, quantized_int4wo, unquantized_int4wo = self._get_memory_footprint(transformer_int4wo)
-        total_int4wo_gs32, quantized_int4wo_gs32, unquantized_int4wo_gs32 = self._get_memory_footprint(
-            transformer_int4wo_gs32
-        )
-        total_int8wo, quantized_int8wo, unquantized_int8wo = self._get_memory_footprint(transformer_int8wo)
-        total_bf16, quantized_bf16, unquantized_bf16 = self._get_memory_footprint(transformer_bf16)
+        total_int4wo = get_model_size_in_bytes(transformer_int4wo)
+        total_int4wo_gs32 = get_model_size_in_bytes(transformer_int4wo_gs32)
+        total_int8wo = get_model_size_in_bytes(transformer_int8wo)
+        total_bf16 = get_model_size_in_bytes(transformer_bf16)
 
-        self.assertTrue(quantized_bf16 == 0 and total_bf16 == unquantized_bf16)
-        # Latter has smaller group size, so more groups -> more scales and zero points
-        self.assertTrue(total_int4wo < total_int4wo_gs32)
-        # int4 with default group size quantized very few linear layers compared to a smaller group size of 32
-        self.assertTrue(quantized_int4wo < quantized_int4wo_gs32 and unquantized_int4wo > unquantized_int4wo_gs32)
+        # int4wo_gs32 has smaller group size, so more groups -> more scales and zero points
+        self.assertTrue(total_int8wo < total_bf16 < total_int4wo_gs32)
         # int8 quantizes more layers compare to int4 with default group size
-        self.assertTrue(quantized_int8wo < quantized_int4wo)
-        self.assertTrue(total_int8wo < total_int4wo)
+        # int4wo does not quantize too many layers because of default group size, but for the layers it does
+        # there is additional overhead of scales and zero points
+        self.assertTrue(total_bf16 < total_int4wo)
 
     def test_wrong_config(self):
         with self.assertRaises(ValueError):
```
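A back-of-envelope sketch of the group-size effect these assertions rely on; the byte counts are illustrative assumptions (packed 4-bit data plus one bf16 scale and zero point per group), not measurements from the test:

```python
def int4wo_weight_bytes(n_weights: int, group_size: int) -> float:
    # Packed 4-bit data (0.5 byte/weight) plus per-group scale and zero
    # point, assumed 2 bytes each for this illustration.
    n_groups = n_weights // group_size
    return n_weights * 0.5 + n_groups * (2 + 2)


n = 4096
print(int4wo_weight_bytes(n, 128))  # 2176.0
print(int4wo_weight_bytes(n, 32))   # 2560.0 -> smaller groups, more metadata
```

Smaller groups mean more scales and zero points, exactly as the in-test comment says; combined with the tiny model's mostly-unquantized layers under the default group size, this overhead is what pushes the int4 totals above `total_bf16`.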
```diff
@@ -456,7 +450,7 @@ class TorchAoTest(unittest.TestCase):
 # This class is not to be run as a test by itself. See the tests that follow this class
 @require_torch
 @require_torch_gpu
-@require_torchao_version_greater("0.6.0")
+@require_torchao_version_greater_or_equal("0.7.0")
 class TorchAoSerializationTest(unittest.TestCase):
     model_name = "hf-internal-testing/tiny-flux-pipe"
     quant_method, quant_method_kwargs = None, None
```
```diff
@@ -565,7 +559,7 @@ class TorchAoSerializationINTA16W8CPUTest(TorchAoSerializationTest):
 # Slices for these tests have been obtained on our aws-g6e-xlarge-plus runners
 @require_torch
 @require_torch_gpu
-@require_torchao_version_greater("0.6.0")
+@require_torchao_version_greater_or_equal("0.7.0")
 @slow
 @nightly
 class SlowTorchAoTests(unittest.TestCase):
```
```diff
@@ -581,11 +575,13 @@ class SlowTorchAoTests(unittest.TestCase):
             quantization_config=quantization_config,
             torch_dtype=torch.bfloat16,
         )
-        text_encoder = CLIPTextModel.from_pretrained(model_id, subfolder="text_encoder")
-        text_encoder_2 = T5EncoderModel.from_pretrained(model_id, subfolder="text_encoder_2")
+        text_encoder = CLIPTextModel.from_pretrained(model_id, subfolder="text_encoder", torch_dtype=torch.bfloat16)
+        text_encoder_2 = T5EncoderModel.from_pretrained(
+            model_id, subfolder="text_encoder_2", torch_dtype=torch.bfloat16
+        )
         tokenizer = CLIPTokenizer.from_pretrained(model_id, subfolder="tokenizer")
         tokenizer_2 = AutoTokenizer.from_pretrained(model_id, subfolder="tokenizer_2")
-        vae = AutoencoderKL.from_pretrained(model_id, subfolder="vae")
+        vae = AutoencoderKL.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.bfloat16)
         scheduler = FlowMatchEulerDiscreteScheduler()
         return {
```
```diff
@@ -617,7 +613,7 @@ class SlowTorchAoTests(unittest.TestCase):
     def _test_quant_type(self, quantization_config, expected_slice):
         components = self.get_dummy_components(quantization_config)
-        pipe = FluxPipeline(**components).to(dtype=torch.bfloat16)
+        pipe = FluxPipeline(**components)
         pipe.enable_model_cpu_offload()
 
         inputs = self.get_dummy_inputs(torch_device)
```