"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "9beaa85b071078f84037f6a036ea042f551a8623"
Unverified Commit 4ee0b755 authored by Štěpán Műller, committed by GitHub
Browse files

LayoutLMv2FeatureExtractor now supports non-English languages when applying Tesseract OCR. (#14514)



* Added the lang argument to apply_tesseract in feature_extraction_layoutlmv2.py, which is used in pytesseract.image_to_data.

* Added ocr_lang argument to LayoutLMv2FeatureExtractor.__init__, which is used when calling apply_tesseract

* Updated the documentation of the LayoutLMv2FeatureExtractor

* Specified in the documentation of the LayoutLMv2FeatureExtractor that the ocr_lang argument should be a language code.

* Update src/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py
Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Split comment into two lines to adhere to the max line size limit.

* Update src/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py
Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
parent ebbe8cc3
...@@ -47,11 +47,11 @@ def normalize_box(box, width, height): ...@@ -47,11 +47,11 @@ def normalize_box(box, width, height):
] ]
def apply_tesseract(image: Image.Image): def apply_tesseract(image: Image.Image, lang: Optional[str]):
"""Applies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes.""" """Applies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes."""
# apply OCR # apply OCR
data = pytesseract.image_to_data(image, output_type="dict") data = pytesseract.image_to_data(image, lang=lang, output_type="dict")
words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"] words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]
# filter empty words and corresponding coordinates # filter empty words and corresponding coordinates
...@@ -102,6 +102,9 @@ class LayoutLMv2FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM ...@@ -102,6 +102,9 @@ class LayoutLMv2FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM
Only has an effect if :obj:`do_resize` is set to :obj:`True`. Only has an effect if :obj:`do_resize` is set to :obj:`True`.
apply_ocr (:obj:`bool`, `optional`, defaults to :obj:`True`): apply_ocr (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes.
ocr_lang (:obj:`Optional[str]`, `optional`):
The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
used.
.. note:: .. note::
...@@ -110,12 +113,13 @@ class LayoutLMv2FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM ...@@ -110,12 +113,13 @@ class LayoutLMv2FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM
model_input_names = ["pixel_values"] model_input_names = ["pixel_values"]
def __init__(self, do_resize=True, size=224, resample=Image.BILINEAR, apply_ocr=True, **kwargs): def __init__(self, do_resize=True, size=224, resample=Image.BILINEAR, apply_ocr=True, ocr_lang=None, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.do_resize = do_resize self.do_resize = do_resize
self.size = size self.size = size
self.resample = resample self.resample = resample
self.apply_ocr = apply_ocr self.apply_ocr = apply_ocr
self.ocr_lang = ocr_lang
if apply_ocr: if apply_ocr:
requires_backends(self, "pytesseract") requires_backends(self, "pytesseract")
...@@ -199,7 +203,7 @@ class LayoutLMv2FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM ...@@ -199,7 +203,7 @@ class LayoutLMv2FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM
words_batch = [] words_batch = []
boxes_batch = [] boxes_batch = []
for image in images: for image in images:
words, boxes = apply_tesseract(self.to_pil_image(image)) words, boxes = apply_tesseract(self.to_pil_image(image), self.ocr_lang)
words_batch.append(words) words_batch.append(words)
boxes_batch.append(boxes) boxes_batch.append(boxes)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment