Unverified commit 24845aeb, authored by Kelvin Kong and committed by GitHub
Browse files

Layoutlmv2 tesseractconfig (#17733)



* Added option for users to modify config parameter used by pytesseract during feature extraction

- Added an optional 'tess_config' kwarg, set when creating the LayoutLMV2 processor, that is passed to pytesseract during feature extraction
- E.g., it can be used to modify PSM (page segmentation mode) values by setting tess_config to '--psm 7'
- Different PSM values significantly influence the output of LayoutLMv2

* Update src/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py
Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py
Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Updated variable names to be more explicit

* Fixed styles

* Added option for users to modify config parameter when calling pytesseract during feature extraction

- Added option to set "tesseract_config" parameter during LayoutLMV3 processor initialization
- Can be used to modify PSM values, e.g. by setting tesseract_config="--psm 6"

* Removed  from function signature
Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
parent 151a2aaa
...@@ -46,11 +46,11 @@ def normalize_box(box, width, height): ...@@ -46,11 +46,11 @@ def normalize_box(box, width, height):
] ]
def apply_tesseract(image: Image.Image, lang: Optional[str]): def apply_tesseract(image: Image.Image, lang: Optional[str], tesseract_config: Optional[str]):
"""Applies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes.""" """Applies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes."""
# apply OCR # apply OCR
data = pytesseract.image_to_data(image, lang=lang, output_type="dict") data = pytesseract.image_to_data(image, lang=lang, output_type="dict", config=tesseract_config)
words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"] words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]
# filter empty words and corresponding coordinates # filter empty words and corresponding coordinates
...@@ -100,9 +100,12 @@ class LayoutLMv2FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM ...@@ -100,9 +100,12 @@ class LayoutLMv2FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM
if `do_resize` is set to `True`. if `do_resize` is set to `True`.
apply_ocr (`bool`, *optional*, defaults to `True`): apply_ocr (`bool`, *optional*, defaults to `True`):
Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes.
ocr_lang (`Optional[str]`, *optional*): ocr_lang (`str`, *optional*):
The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
used. used.
tesseract_config (`str`, *optional*):
Any additional custom configuration flags that are forwarded to the `config` parameter when calling
Tesseract. For example: '--psm 6'.
<Tip> <Tip>
...@@ -112,13 +115,23 @@ class LayoutLMv2FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM ...@@ -112,13 +115,23 @@ class LayoutLMv2FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM
model_input_names = ["pixel_values"] model_input_names = ["pixel_values"]
def __init__(self, do_resize=True, size=224, resample=Image.BILINEAR, apply_ocr=True, ocr_lang=None, **kwargs): def __init__(
self,
do_resize=True,
size=224,
resample=Image.BILINEAR,
apply_ocr=True,
ocr_lang=None,
tesseract_config="",
**kwargs
):
super().__init__(**kwargs) super().__init__(**kwargs)
self.do_resize = do_resize self.do_resize = do_resize
self.size = size self.size = size
self.resample = resample self.resample = resample
self.apply_ocr = apply_ocr self.apply_ocr = apply_ocr
self.ocr_lang = ocr_lang self.ocr_lang = ocr_lang
self.tesseract_config = tesseract_config
def __call__( def __call__(
self, images: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs self, images: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs
...@@ -201,7 +214,7 @@ class LayoutLMv2FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM ...@@ -201,7 +214,7 @@ class LayoutLMv2FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM
words_batch = [] words_batch = []
boxes_batch = [] boxes_batch = []
for image in images: for image in images:
words, boxes = apply_tesseract(self.to_pil_image(image), self.ocr_lang) words, boxes = apply_tesseract(self.to_pil_image(image), self.ocr_lang, self.tesseract_config)
words_batch.append(words) words_batch.append(words)
boxes_batch.append(boxes) boxes_batch.append(boxes)
......
...@@ -46,11 +46,10 @@ def normalize_box(box, width, height): ...@@ -46,11 +46,10 @@ def normalize_box(box, width, height):
] ]
def apply_tesseract(image: Image.Image, lang: Optional[str]): def apply_tesseract(image: Image.Image, lang: Optional[str], tesseract_config: Optional[str]):
"""Applies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes.""" """Applies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes."""
# apply OCR # apply OCR
data = pytesseract.image_to_data(image, lang=lang, output_type="dict") data = pytesseract.image_to_data(image, lang=lang, output_type="dict", config=tesseract_config)
words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"] words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]
# filter empty words and corresponding coordinates # filter empty words and corresponding coordinates
...@@ -106,9 +105,12 @@ class LayoutLMv3FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM ...@@ -106,9 +105,12 @@ class LayoutLMv3FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM
The sequence of standard deviations for each channel, to be used when normalizing images. The sequence of standard deviations for each channel, to be used when normalizing images.
apply_ocr (`bool`, *optional*, defaults to `True`): apply_ocr (`bool`, *optional*, defaults to `True`):
Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes.
ocr_lang (`Optional[str]`, *optional*): ocr_lang (`str`, *optional*):
The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
used. used.
tesseract_config (`str`, *optional*):
Any additional custom configuration flags that are forwarded to the `config` parameter when calling
Tesseract. For example: '--psm 6'.
<Tip> <Tip>
...@@ -128,6 +130,7 @@ class LayoutLMv3FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM ...@@ -128,6 +130,7 @@ class LayoutLMv3FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM
image_std=None, image_std=None,
apply_ocr=True, apply_ocr=True,
ocr_lang=None, ocr_lang=None,
tesseract_config="",
**kwargs **kwargs
): ):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -139,6 +142,7 @@ class LayoutLMv3FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM ...@@ -139,6 +142,7 @@ class LayoutLMv3FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM
self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
self.apply_ocr = apply_ocr self.apply_ocr = apply_ocr
self.ocr_lang = ocr_lang self.ocr_lang = ocr_lang
self.tesseract_config = tesseract_config
def __call__( def __call__(
self, images: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs self, images: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs
...@@ -221,7 +225,7 @@ class LayoutLMv3FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM ...@@ -221,7 +225,7 @@ class LayoutLMv3FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM
words_batch = [] words_batch = []
boxes_batch = [] boxes_batch = []
for image in images: for image in images:
words, boxes = apply_tesseract(self.to_pil_image(image), self.ocr_lang) words, boxes = apply_tesseract(self.to_pil_image(image), self.ocr_lang, self.tesseract_config)
words_batch.append(words) words_batch.append(words)
boxes_batch.append(boxes) boxes_batch.append(boxes)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment