Unverified Commit 52c4d32d authored by Chanchana Sornsoontorn's avatar Chanchana Sornsoontorn Committed by GitHub
Browse files

Fix typo and format BasicTransformerBlock attributes (#2953)

* chore(train_controlnet): fix typo in logger message

* chore(models): reorder module attributes to match their calling order

When printing the BasicTransformerBlock to stdout, I think it's crucial that the attributes are shown in the proper order. Also, the "3. Feed Forward" comment previously didn't make sense: it should have been next to self.ff, but it was instead next to self.norm3.

* correct many tests

* remove bogus file

* make style

* correct more tests

* finish tests

* fix one more

* make style

* make unclip deterministic

* chore(models/attention): reorganize comments in BasicTransformerBlock class

---------
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
parent c6180a31
...@@ -47,6 +47,7 @@ class StableUnCLIPImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCas ...@@ -47,6 +47,7 @@ class StableUnCLIPImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCas
feature_extractor = CLIPImageProcessor(crop_size=32, size=32) feature_extractor = CLIPImageProcessor(crop_size=32, size=32)
torch.manual_seed(0)
image_encoder = CLIPVisionModelWithProjection( image_encoder = CLIPVisionModelWithProjection(
CLIPVisionConfig( CLIPVisionConfig(
hidden_size=embedder_hidden_size, hidden_size=embedder_hidden_size,
...@@ -119,16 +120,16 @@ class StableUnCLIPImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCas ...@@ -119,16 +120,16 @@ class StableUnCLIPImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCas
components = { components = {
# image encoding components # image encoding components
"feature_extractor": feature_extractor, "feature_extractor": feature_extractor,
"image_encoder": image_encoder, "image_encoder": image_encoder.eval(),
# image noising components # image noising components
"image_normalizer": image_normalizer, "image_normalizer": image_normalizer.eval(),
"image_noising_scheduler": image_noising_scheduler, "image_noising_scheduler": image_noising_scheduler,
# regular denoising components # regular denoising components
"tokenizer": tokenizer, "tokenizer": tokenizer,
"text_encoder": text_encoder, "text_encoder": text_encoder.eval(),
"unet": unet, "unet": unet.eval(),
"scheduler": scheduler, "scheduler": scheduler,
"vae": vae, "vae": vae.eval(),
} }
return components return components
...@@ -169,9 +170,7 @@ class StableUnCLIPImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCas ...@@ -169,9 +170,7 @@ class StableUnCLIPImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCas
image_slice = image[0, -3:, -3:, -1] image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 32, 32, 3) assert image.shape == (1, 32, 32, 3)
expected_slice = np.array( expected_slice = np.array([0.3872, 0.7224, 0.5601, 0.4741, 0.6872, 0.5814, 0.4636, 0.3867, 0.5078])
[0.34588397, 0.7747054, 0.5453714, 0.5227859, 0.57656777, 0.6532228, 0.5177634, 0.49932978, 0.56626225]
)
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
......
...@@ -135,7 +135,7 @@ class TextToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase): ...@@ -135,7 +135,7 @@ class TextToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
image_slice = frames[0][-3:, -3:, -1] image_slice = frames[0][-3:, -3:, -1]
assert frames[0].shape == (64, 64, 3) assert frames[0].shape == (64, 64, 3)
expected_slice = np.array([166, 184, 167, 118, 102, 123, 108, 93, 114]) expected_slice = np.array([158.0, 160.0, 153.0, 125.0, 100.0, 121.0, 111.0, 93.0, 113.0])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
......
...@@ -143,7 +143,7 @@ class VQDiffusionPipelineFastTests(unittest.TestCase): ...@@ -143,7 +143,7 @@ class VQDiffusionPipelineFastTests(unittest.TestCase):
assert image.shape == (1, 24, 24, 3) assert image.shape == (1, 24, 24, 3)
expected_slice = np.array([0.6583, 0.6410, 0.5325, 0.5635, 0.5563, 0.4234, 0.6008, 0.5491, 0.4880]) expected_slice = np.array([0.6551, 0.6168, 0.5008, 0.5676, 0.5659, 0.4295, 0.6073, 0.5599, 0.4992])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
...@@ -187,7 +187,7 @@ class VQDiffusionPipelineFastTests(unittest.TestCase): ...@@ -187,7 +187,7 @@ class VQDiffusionPipelineFastTests(unittest.TestCase):
assert image.shape == (1, 24, 24, 3) assert image.shape == (1, 24, 24, 3)
expected_slice = np.array([0.6647, 0.6531, 0.5303, 0.5891, 0.5726, 0.4439, 0.6304, 0.5564, 0.4912]) expected_slice = np.array([0.6693, 0.6075, 0.4959, 0.5701, 0.5583, 0.4333, 0.6171, 0.5684, 0.4988])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
......
...@@ -411,10 +411,7 @@ class Transformer2DModelTests(unittest.TestCase): ...@@ -411,10 +411,7 @@ class Transformer2DModelTests(unittest.TestCase):
assert attention_scores.shape == (1, 64, 64, 64) assert attention_scores.shape == (1, 64, 64, 64)
output_slice = attention_scores[0, -1, -3:, -3:] output_slice = attention_scores[0, -1, -3:, -3:]
expected_slice = torch.tensor([0.0143, -0.6909, -2.1547, -1.8893, 1.4097, 0.1359, -0.2521, -1.3359, 0.2598])
expected_slice = torch.tensor(
[-0.2555, -0.8877, -2.4739, -2.2251, 1.2714, 0.0807, -0.4161, -1.6408, -0.0471], device=torch_device
)
assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3)
def test_spatial_transformer_timestep(self): def test_spatial_transformer_timestep(self):
...@@ -445,14 +442,12 @@ class Transformer2DModelTests(unittest.TestCase): ...@@ -445,14 +442,12 @@ class Transformer2DModelTests(unittest.TestCase):
output_slice_1 = attention_scores_1[0, -1, -3:, -3:] output_slice_1 = attention_scores_1[0, -1, -3:, -3:]
output_slice_2 = attention_scores_2[0, -1, -3:, -3:] output_slice_2 = attention_scores_2[0, -1, -3:, -3:]
expected_slice_1 = torch.tensor( expected_slice = torch.tensor([-0.3923, -1.0923, -1.7144, -1.5570, 1.4154, 0.1738, -0.1157, -1.2998, -0.1703])
[-0.1874, -0.9704, -1.4290, -1.3357, 1.5138, 0.3036, -0.0976, -1.1667, 0.1283], device=torch_device
)
expected_slice_2 = torch.tensor( expected_slice_2 = torch.tensor(
[-0.3493, -1.0924, -1.6161, -1.5016, 1.4245, 0.1367, -0.2526, -1.3109, -0.0547], device=torch_device [-0.4311, -1.1376, -1.7732, -1.5997, 1.3450, 0.0964, -0.1569, -1.3590, -0.2348]
) )
assert torch.allclose(output_slice_1.flatten(), expected_slice_1, atol=1e-3) assert torch.allclose(output_slice_1.flatten(), expected_slice, atol=1e-3)
assert torch.allclose(output_slice_2.flatten(), expected_slice_2, atol=1e-3) assert torch.allclose(output_slice_2.flatten(), expected_slice_2, atol=1e-3)
def test_spatial_transformer_dropout(self): def test_spatial_transformer_dropout(self):
......
...@@ -57,7 +57,7 @@ class CrossAttnDownBlock2DTests(UNetBlockTesterMixin, unittest.TestCase): ...@@ -57,7 +57,7 @@ class CrossAttnDownBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
return init_dict, inputs_dict return init_dict, inputs_dict
def test_output(self): def test_output(self):
expected_slice = [0.2440, -0.6953, -0.2140, -0.3874, 0.1966, 1.2077, 0.0441, -0.7718, 0.2800] expected_slice = [0.2238, -0.7396, -0.2255, -0.3829, 0.1925, 1.1665, 0.0603, -0.7295, 0.1983]
super().test_output(expected_slice) super().test_output(expected_slice)
...@@ -175,7 +175,7 @@ class UNetMidBlock2DCrossAttnTests(UNetBlockTesterMixin, unittest.TestCase): ...@@ -175,7 +175,7 @@ class UNetMidBlock2DCrossAttnTests(UNetBlockTesterMixin, unittest.TestCase):
return init_dict, inputs_dict return init_dict, inputs_dict
def test_output(self): def test_output(self):
expected_slice = [0.1879, 2.2653, 0.5987, 1.1568, -0.8454, -1.6109, -0.8919, 0.8306, 1.6758] expected_slice = [0.0187, 2.4220, 0.4484, 1.1203, -0.6121, -1.5122, -0.8270, 0.7851, 1.8335]
super().test_output(expected_slice) super().test_output(expected_slice)
...@@ -237,7 +237,7 @@ class CrossAttnUpBlock2DTests(UNetBlockTesterMixin, unittest.TestCase): ...@@ -237,7 +237,7 @@ class CrossAttnUpBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
return init_dict, inputs_dict return init_dict, inputs_dict
def test_output(self): def test_output(self):
expected_slice = [-0.2796, -0.4364, -0.1067, -0.2693, 0.1894, 0.3869, -0.3470, 0.4584, 0.5091] expected_slice = [-0.1403, -0.3515, -0.0420, -0.1425, 0.3167, 0.5094, -0.2181, 0.5931, 0.5582]
super().test_output(expected_slice) super().test_output(expected_slice)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment