Added support for EXIF orientation transform in read_image for PNG (#8303)

bdd690e5 · vfdev · GitHub · 5501bfe2 · bdd690e5 · bdd690e5
Unverified Commit bdd690e5 authored Mar 13, 2024 by vfdev Committed by GitHub Mar 13, 2024
8 changed files
--- a/test/test_image.py
+++ b/test/test_image.py
@@ -100,14 +100,15 @@ def test_decode_jpeg(img_path, pil_mode, mode):
    assert abs_mean_diff < 2
+@pytest.mark.parametrize("codec", ["png", "jpeg"])
 @pytest.mark.parametrize("orientation", [1, 2, 3, 4, 5, 6, 7, 8, 0])
-def test_decode_jpeg_with_exif_orientation(tmpdir, orientation):
+def test_decode_with_exif_orientation(tmpdir, codec, orientation):
-    fp = os.path.join(tmpdir, f"exif_oriented_{orientation}.jpg")
+    fp = os.path.join(tmpdir, f"exif_oriented_{orientation}.{codec}")
    t = torch.randint(0, 256, size=(3, 256, 257), dtype=torch.uint8)
    im = F.to_pil_image(t)
    exif = im.getexif()
    exif[0x0112] = orientation  # set exif orientation
-    im.save(fp, "JPEG", exif=exif.tobytes())
+    im.save(fp, codec.upper(), exif=exif.tobytes())
    data = read_file(fp)
    output = decode_image(data, apply_exif_orientation=True)

--- a/torchvision/csrc/io/image/cpu/decode_image.cpp
+++ b/torchvision/csrc/io/image/cpu/decode_image.cpp
@@ -27,7 +27,8 @@ torch::Tensor decode_image(
  if (memcmp(jpeg_signature, datap, 3) == 0) {
    return decode_jpeg(data, mode, apply_exif_orientation);
  } else if (memcmp(png_signature, datap, 4) == 0) {
-    return decode_png(data, mode);
+    return decode_png(
+        data, mode, /*allow_16_bits=*/false, apply_exif_orientation);
  } else {
    TORCH_CHECK(
        false,

--- a/torchvision/csrc/io/image/cpu/decode_jpeg.cpp
+++ b/torchvision/csrc/io/image/cpu/decode_jpeg.cpp
@@ -203,7 +203,7 @@ torch::Tensor decode_jpeg(
  int exif_orientation = -1;
  if (apply_exif_orientation) {
-    exif_orientation = fetch_exif_orientation(&cinfo);
+    exif_orientation = fetch_jpeg_exif_orientation(&cinfo);
  }
  jpeg_start_decompress(&cinfo);

--- a/torchvision/csrc/io/image/cpu/decode_png.cpp
+++ b/torchvision/csrc/io/image/cpu/decode_png.cpp
 #include "decode_png.h"
 #include "common_png.h"
+#include "exif.h"
 namespace vision {
 namespace image {
+using namespace exif_private;
 #if !PNG_FOUND
 torch::Tensor decode_png(
    const torch::Tensor& data,
    ImageReadMode mode,
-    bool allow_16_bits) {
+    bool allow_16_bits,
+    bool apply_exif_orientation) {
  TORCH_CHECK(
      false, "decode_png: torchvision not compiled with libPNG support");
 }
@@ -22,7 +26,8 @@ bool is_little_endian() {
 torch::Tensor decode_png(
    const torch::Tensor& data,
    ImageReadMode mode,
-    bool allow_16_bits) {
+    bool allow_16_bits,
+    bool apply_exif_orientation) {
  C10_LOG_API_USAGE_ONCE("torchvision.csrc.io.image.cpu.decode_png.decode_png");
  // Check that the input tensor dtype is uint8
  TORCH_CHECK(data.dtype() == torch::kU8, "Expected a torch.uint8 tensor");
@@ -234,8 +239,19 @@ torch::Tensor decode_png(
      t_ptr = tensor.accessor<int32_t, 3>().data();
    }
  }
+  int exif_orientation = -1;
+  if (apply_exif_orientation) {
+    exif_orientation = fetch_png_exif_orientation(png_ptr, info_ptr);
+  }
  png_destroy_read_struct(&png_ptr, &info_ptr, nullptr);
-  return tensor.permute({2, 0, 1});
+  auto output = tensor.permute({2, 0, 1});
+  if (apply_exif_orientation) {
+    return exif_orientation_transform(output, exif_orientation);
+  }
+  return output;
 }
 #endif

--- a/torchvision/csrc/io/image/cpu/decode_png.h
+++ b/torchvision/csrc/io/image/cpu/decode_png.h
@@ -9,7 +9,8 @@ namespace image {
 C10_EXPORT torch::Tensor decode_png(
    const torch::Tensor& data,
    ImageReadMode mode = IMAGE_READ_MODE_UNCHANGED,
-    bool allow_16_bits = false);
+    bool allow_16_bits = false,
+    bool apply_exif_orientation = false);
 } // namespace image
 } // namespace vision
--- a/torchvision/csrc/io/image/cpu/exif.h
+++ b/torchvision/csrc/io/image/cpu/exif.h
@@ -51,8 +51,12 @@ direct,
 // https://github.com/opencv/opencv/blob/097891e311fae1d8354eb092a0fd0171e630d78c/modules/imgcodecs/src/exif.cpp
 #if JPEG_FOUND
 #include <jpeglib.h>
+#endif
+#if PNG_FOUND
+#include <png.h>
+#endif
 #include <torch/types.h>
 namespace vision {
@@ -126,8 +130,48 @@ inline uint32_t get_uint32(
      (exif_data[offset + 2] << 8) + exif_data[offset + 3];
 }
-inline int fetch_exif_orientation(j_decompress_ptr cinfo) {
+// Scans a raw EXIF buffer for the orientation tag (ORIENTATION_EXIF_TAG) and
+// returns its value, or -1 when the tag mark does not match or no orientation
+// entry is found.
+//   exif_data_ptr: start of the EXIF payload. The tag mark is read at byte 2
+//     and the directory offset at byte 4, so the buffer is expected to begin
+//     at the 2-byte endianness marker, not at the "Exif\0\0" prefix.
+//   size: number of bytes available at exif_data_ptr.
+inline int fetch_exif_orientation(unsigned char* exif_data_ptr, size_t size) {
  int exif_orientation = -1;
+  // Exif binary structure looks like this
+  // First 6 bytes: [E, x, i, f, 0, 0] (container prefix; the reads below
+  // assume the caller has already skipped it)
+  // Endianness, 2 bytes : [M, M] or [I, I]
+  // Tag mark, 2 bytes: [0, 0x2a]
+  // Offset, 4 bytes
+  // Num entries, 2 bytes
+  // Tag entries and data, tag has 2 bytes and its data has 10 bytes
+  // For more details:
+  // http://www.media.mit.edu/pia/Research/deepview/exif.html
+  ExifDataReader exif_data(exif_data_ptr, size);
+  auto endianness = get_endianness(exif_data);
+  // Check that the tag mark read from the buffer matches the required value
+  // (REQ_EXIF_TAG_MARK); otherwise the data is not parsed any further.
+  uint16_t tag_mark = get_uint16(exif_data, endianness, 2);
+  if (tag_mark == REQ_EXIF_TAG_MARK) {
+    auto offset = get_uint32(exif_data, endianness, 4);
+    size_t num_entry = get_uint16(exif_data, endianness, offset);
+    offset += 2; // go to start of tag fields
+    constexpr size_t tiff_field_size = 12;
+    for (size_t entry = 0; entry < num_entry; entry++) {
+      // Here we just search for orientation tag and parse it
+      auto tag_num = get_uint16(exif_data, endianness, offset);
+      if (tag_num == INCORRECT_TAG) {
+        break;
+      }
+      if (tag_num == ORIENTATION_EXIF_TAG) {
+        // The orientation value is read 8 bytes into the 12-byte entry.
+        exif_orientation = get_uint16(exif_data, endianness, offset + 8);
+        break;
+      }
+      offset += tiff_field_size;
+    }
+  }
+  return exif_orientation;
+}
+#if JPEG_FOUND
+inline int fetch_jpeg_exif_orientation(j_decompress_ptr cinfo) {
  // Check for Exif marker APP1
  jpeg_saved_marker_ptr exif_marker = 0;
  jpeg_saved_marker_ptr cmarker = cinfo->marker_list;
@@ -138,51 +182,45 @@ inline int fetch_exif_orientation(j_decompress_ptr cinfo) {
    cmarker = cmarker->next;
  }
-  if (exif_marker) {
+  if (!exif_marker) {
-    // Exif binary structure looks like this
+    return -1;
-    // First 6 bytes: [E, x, i, f, 0, 0]
-    // Endianness, 2 bytes : [M, M] or [I, I]
-    // Tag mark, 2 bytes: [0, 0x2a]
-    // Offset, 4 bytes
-    // Num entries, 2 bytes
-    // Tag entries and data, tag has 2 bytes and its data has 10 bytes
-    // For more details:
-    // http://www.media.mit.edu/pia/Research/deepview/exif.html
-    // Bytes from Exif size field to the first TIFF header
-    constexpr size_t start_offset = 6;
-    if (exif_marker->data_length > start_offset) {
-      auto* exif_data_ptr = exif_marker->data + start_offset;
-      auto size = exif_marker->data_length - start_offset;
-      ExifDataReader exif_data(exif_data_ptr, size);
-      auto endianness = get_endianness(exif_data);
-      // Checking whether Tag Mark (0x002A) correspond to one contained in the
-      // Jpeg file
-      uint16_t tag_mark = get_uint16(exif_data, endianness, 2);
-      if (tag_mark == REQ_EXIF_TAG_MARK) {
-        auto offset = get_uint32(exif_data, endianness, 4);
-        size_t num_entry = get_uint16(exif_data, endianness, offset);
-        offset += 2; // go to start of tag fields
-        constexpr size_t tiff_field_size = 12;
-        for (size_t entry = 0; entry < num_entry; entry++) {
-          // Here we just search for orientation tag and parse it
-          auto tag_num = get_uint16(exif_data, endianness, offset);
-          if (tag_num == INCORRECT_TAG) {
-            break;
-          }
-          if (tag_num == ORIENTATION_EXIF_TAG) {
-            exif_orientation = get_uint16(exif_data, endianness, offset + 8);
-            break;
-          }
-          offset += tiff_field_size;
-        }
-      }
-    }
  }
-  return exif_orientation;
+  constexpr size_t start_offset = 6;
+  if (exif_marker->data_length <= start_offset) {
+    return -1;
+  }
+  auto* exif_data_ptr = exif_marker->data + start_offset;
+  auto size = exif_marker->data_length - start_offset;
+  return fetch_exif_orientation(exif_data_ptr, size);
+}
+#else // #if JPEG_FOUND
+// Fallback used when JPEG_FOUND is not set: no EXIF data is read and -1 is
+// always returned.
+// NOTE(review): j_decompress_ptr is declared by <jpeglib.h>, which this header
+// only includes under JPEG_FOUND - confirm this branch compiles in a build
+// without libjpeg.
+inline int fetch_jpeg_exif_orientation(j_decompress_ptr cinfo) {
+  return -1;
+}
+#endif // #if JPEG_FOUND
+#if PNG_FOUND && defined(PNG_eXIf_SUPPORTED)
+// Returns the EXIF orientation stored in the eXIf chunk attached to info_ptr,
+// or -1 when the chunk is absent or empty.
+//   png_ptr / info_ptr: libpng read and info structs, passed straight to
+//     png_get_valid and png_get_eXIf_1; the returned buffer is only read.
+inline int fetch_png_exif_orientation(png_structp png_ptr, png_infop info_ptr) {
+  png_uint_32 num_exif = 0;
+  png_bytep exif = nullptr;
+  // Exif info could be in info_ptr
+  if (png_get_valid(png_ptr, info_ptr, PNG_INFO_eXIf)) {
+    png_get_eXIf_1(png_ptr, info_ptr, &num_exif, &exif);
+  }
+  if (exif && num_exif > 0) {
+    return fetch_exif_orientation(exif, num_exif);
+  }
+  // No usable eXIf data: return -1 explicitly. Without this return, control
+  // would flow off the end of a non-void function, which is undefined
+  // behavior, for every PNG that carries no EXIF chunk.
+  return -1;
+}
+#else // #if PNG_FOUND && defined(PNG_eXIf_SUPPORTED)
+// Fallback used when libpng is unavailable or lacks eXIf support
+// (PNG_eXIf_SUPPORTED undefined): no EXIF data is read and -1 is always
+// returned.
+// NOTE(review): this branch is also taken when PNG_FOUND is false, in which
+// case <png.h> is not included and png_structp / png_infop may be undeclared -
+// confirm this compiles in a build without libpng.
+inline int fetch_png_exif_orientation(png_structp png_ptr, png_infop info_ptr) {
+  return -1;
 }
+#endif // #if PNG_FOUND && defined(PNG_eXIf_SUPPORTED)
 constexpr uint16_t IMAGE_ORIENTATION_TL = 1; // normal orientation
 constexpr uint16_t IMAGE_ORIENTATION_TR = 2; // needs horizontal flip
@@ -223,5 +261,3 @@ inline torch::Tensor exif_orientation_transform(
 } // namespace exif_private
 } // namespace image
 } // namespace vision
-#endif
--- a/torchvision/csrc/io/image/image.cpp
+++ b/torchvision/csrc/io/image/image.cpp
@@ -21,7 +21,8 @@ namespace image {
 static auto registry =
    torch::RegisterOperators()
-        .op("image::decode_png", &decode_png)
+        .op("image::decode_png(Tensor data, int mode, bool allow_16_bits = False, bool apply_exif_orientation=False) -> Tensor",
+            &decode_png)
        .op("image::encode_png", &encode_png)
        .op("image::decode_jpeg(Tensor data, int mode, bool apply_exif_orientation=False) -> Tensor",
            &decode_jpeg)

--- a/torchvision/io/image.py
+++ b/torchvision/io/image.py
@@ -67,7 +67,9 @@ def write_file(filename: str, data: torch.Tensor) -> None:
    torch.ops.image.write_file(filename, data)
-def decode_png(input: torch.Tensor, mode: ImageReadMode = ImageReadMode.UNCHANGED) -> torch.Tensor:
+def decode_png(
+    input: torch.Tensor, mode: ImageReadMode = ImageReadMode.UNCHANGED, apply_exif_orientation: bool = False
+) -> torch.Tensor:
    """
    Decodes a PNG image into a 3 dimensional RGB or grayscale Tensor.
    Optionally converts the image to the desired format.
@@ -80,13 +82,15 @@ def decode_png(input: torch.Tensor, mode: ImageReadMode = ImageReadMode.UNCHANGE
            converting the image. Default: ``ImageReadMode.UNCHANGED``.
            See `ImageReadMode` class for more information on various
            available modes.
+        apply_exif_orientation (bool): apply EXIF orientation transformation to the output tensor.
+            Default: False.
    Returns:
        output (Tensor[image_channels, image_height, image_width])
    """
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        _log_api_usage_once(decode_png)
-    output = torch.ops.image.decode_png(input, mode.value, False)
+    output = torch.ops.image.decode_png(input, mode.value, False, apply_exif_orientation)
    return output
@@ -235,7 +239,7 @@ def decode_image(
            See ``ImageReadMode`` class for more information on various
            available modes.
        apply_exif_orientation (bool): apply EXIF orientation transformation to the output tensor.
-            Default: False. Only implemented for JPEG format
+            Default: False.
    Returns:
        output (Tensor[image_channels, image_height, image_width])
@@ -261,7 +265,7 @@ def read_image(
            See ``ImageReadMode`` class for more information on various
            available modes.
        apply_exif_orientation (bool): apply EXIF orientation transformation to the output tensor.
-            Default: False. Only implemented for JPEG format
+            Default: False.
    Returns:
        output (Tensor[image_channels, image_height, image_width])