Layoutlmv2 tesseractconfig (#17733)

* Added option for users to modify config parameter used by pytesseract during feature extraction - Added optional 'tess_config' kwarg when setting up LayoutLMV2 processor that is used by pytesseract during feature extraction - Eg. Can be used to modify psm values by setting tess_config to '--psm 7' - Different psm values significantly influences the output of layoutlmv2 * Update src/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Updated variable names to be more explicit * Fixed styles * Added option for users to modify config parameter when calling pytesseract during feature extraction - Added option to set "tesseract_config" parameter during LayoutLMV3 processor initialization - Can be used to modify PSM values, eg. by setting tesseract_config="--psm 6" * Removed from function signature Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

Layoutlmv2 tesseractconfig (#17733)
* Added option for users to modify config parameter used by pytesseract during feature extraction - Added optional 'tess_config' kwarg when setting up LayoutLMV2 processor that is used by pytesseract during feature extraction - Eg. Can be used to modify psm values by setting tess_config to '--psm 7' - Different psm values significantly influences the output of layoutlmv2 * Update src/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Updated variable names to be more explicit * Fixed styles * Added option for users to modify config parameter when calling pytesseract during feature extraction - Added option to set "tesseract_config" parameter during LayoutLMV3 processor initialization - Can be used to modify PSM values, eg. by setting tesseract_config="--psm 6" * Removed from function signature Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
24845aeb · Kelvin Kong · GitHub · 151a2aaa · 24845aeb · 24845aeb
Unverified Commit 24845aeb authored Aug 02, 2022 by Kelvin Kong Committed by GitHub Aug 01, 2022
2 changed files
--- a/src/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py
@@ -46,11 +46,11 @@ def normalize_box(box, width, height):
    ]
-def apply_tesseract(image: Image.Image, lang: Optional[str]):
+def apply_tesseract(image: Image.Image, lang: Optional[str], tesseract_config: Optional[str]):
    """Applies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes."""
    # apply OCR
-    data = pytesseract.image_to_data(image, lang=lang, output_type="dict")
+    data = pytesseract.image_to_data(image, lang=lang, output_type="dict", config=tesseract_config)
    words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]
    # filter empty words and corresponding coordinates
@@ -100,9 +100,12 @@ class LayoutLMv2FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM
            if `do_resize` is set to `True`.
        apply_ocr (`bool`, *optional*, defaults to `True`):
            Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes.
-        ocr_lang (`Optional[str]`, *optional*):
+        ocr_lang (`str`, *optional*):
            The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
            used.
+        tesseract_config (`str`, *optional*):
+            Any additional custom configuration flags that are forwarded to the `config` parameter when calling
+            Tesseract. For example: '--psm 6'.
            <Tip>
@@ -112,13 +115,23 @@ class LayoutLMv2FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM
    model_input_names = ["pixel_values"]
-    def __init__(self, do_resize=True, size=224, resample=Image.BILINEAR, apply_ocr=True, ocr_lang=None, **kwargs):
+    def __init__(
+        self,
+        do_resize=True,
+        size=224,
+        resample=Image.BILINEAR,
+        apply_ocr=True,
+        ocr_lang=None,
+        tesseract_config="",
+        **kwargs
+    ):
        super().__init__(**kwargs)
        self.do_resize = do_resize
        self.size = size
        self.resample = resample
        self.apply_ocr = apply_ocr
        self.ocr_lang = ocr_lang
+        self.tesseract_config = tesseract_config
    def __call__(
        self, images: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs
@@ -201,7 +214,7 @@ class LayoutLMv2FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM
            words_batch = []
            boxes_batch = []
            for image in images:
-                words, boxes = apply_tesseract(self.to_pil_image(image), self.ocr_lang)
+                words, boxes = apply_tesseract(self.to_pil_image(image), self.ocr_lang, self.tesseract_config)
                words_batch.append(words)
                boxes_batch.append(boxes)

--- a/src/transformers/models/layoutlmv3/feature_extraction_layoutlmv3.py
+++ b/src/transformers/models/layoutlmv3/feature_extraction_layoutlmv3.py
@@ -46,11 +46,10 @@ def normalize_box(box, width, height):
    ]
-def apply_tesseract(image: Image.Image, lang: Optional[str]):
+def apply_tesseract(image: Image.Image, lang: Optional[str], tesseract_config: Optional[str]):
    """Applies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes."""
    # apply OCR
-    data = pytesseract.image_to_data(image, lang=lang, output_type="dict")
+    data = pytesseract.image_to_data(image, lang=lang, output_type="dict", config=tesseract_config)
    words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]
    # filter empty words and corresponding coordinates
@@ -106,9 +105,12 @@ class LayoutLMv3FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM
            The sequence of standard deviations for each channel, to be used when normalizing images.
        apply_ocr (`bool`, *optional*, defaults to `True`):
            Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes.
-        ocr_lang (`Optional[str]`, *optional*):
+        ocr_lang (`str`, *optional*):
            The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
            used.
+        tesseract_config (`str`, *optional*):
+            Any additional custom configuration flags that are forwarded to the `config` parameter when calling
+            Tesseract. For example: '--psm 6'.
            <Tip>
@@ -128,6 +130,7 @@ class LayoutLMv3FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM
        image_std=None,
        apply_ocr=True,
        ocr_lang=None,
+        tesseract_config="",
        **kwargs
    ):
        super().__init__(**kwargs)
@@ -139,6 +142,7 @@ class LayoutLMv3FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM
        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
        self.apply_ocr = apply_ocr
        self.ocr_lang = ocr_lang
+        self.tesseract_config = tesseract_config
    def __call__(
        self, images: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs
@@ -221,7 +225,7 @@ class LayoutLMv3FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM
            words_batch = []
            boxes_batch = []
            for image in images:
-                words, boxes = apply_tesseract(self.to_pil_image(image), self.ocr_lang)
+                words, boxes = apply_tesseract(self.to_pil_image(image), self.ocr_lang, self.tesseract_config)
                words_batch.append(words)
                boxes_batch.append(boxes)