"doc/vscode:/vscode.git/clone" did not exist on "103ef50106a2a66d99aea7682ab43a7b0a35bf5a"
Unverified Commit cf613b13 authored by Bin Lu's avatar Bin Lu Committed by GitHub
Browse files

Merge branch 'PaddlePaddle:dygraph' into dygraph

parents 8fe6209d 732fa778
......@@ -11,64 +11,45 @@
# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#!/usr/bin/env python
# !/usr/bin/env python
# -*- coding: utf-8 -*-
# pyrcc5 -o libs/resources.py resources.qrc
import argparse
import ast
import codecs
import json
import os.path
import platform
import subprocess
import sys
from functools import partial
from collections import defaultdict
import json
import cv2
try:
from PyQt5 import QtCore, QtGui, QtWidgets
from PyQt5.QtGui import *
from PyQt5.QtCore import *
from PyQt5.QtWidgets import *
except ImportError:
print("Please install pyqt5...")
__dir__ = os.path.dirname(os.path.abspath(__file__))
import numpy as np
sys.path.append(__dir__)
sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
sys.path.append(os.path.abspath(os.path.join(__dir__, '../PaddleOCR')))
sys.path.append("..")
from paddleocr import PaddleOCR
try:
from PyQt5 import QtCore, QtGui, QtWidgets
from PyQt5.QtGui import *
from PyQt5.QtCore import *
from PyQt5.QtWidgets import *
except ImportError:
# needed for py3+qt4
# Ref:
# http://pyqt.sourceforge.net/Docs/PyQt4/incompatible_apis.html
# http://stackoverflow.com/questions/21217399/pyqt4-qtcore-qvariant-object-instead-of-a-string
if sys.version_info.major >= 3:
import sip
sip.setapi('QVariant', 2)
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from combobox import ComboBox
from libs.constants import *
from libs.utils import *
from libs.settings import Settings
from libs.shape import Shape, DEFAULT_LINE_COLOR, DEFAULT_FILL_COLOR,DEFAULT_LOCK_COLOR
from libs.shape import Shape, DEFAULT_LINE_COLOR, DEFAULT_FILL_COLOR, DEFAULT_LOCK_COLOR
from libs.stringBundle import StringBundle
from libs.canvas import Canvas
from libs.zoomWidget import ZoomWidget
from libs.autoDialog import AutoDialog
from libs.labelDialog import LabelDialog
from libs.colorDialog import ColorDialog
from libs.toolBar import ToolBar
from libs.ustr import ustr
from libs.hashableQListWidgetItem import HashableQListWidgetItem
from libs.editinlist import EditInList
......@@ -76,31 +57,19 @@ from libs.editinlist import EditInList
__appname__ = 'PPOCRLabel'
class WindowMixin(object):
    """Mixin providing menu and toolbar factory helpers for a main window.

    The host class must supply ``menuBar()`` and ``addToolBar()``; both are
    called on ``self`` by the helpers below.
    """

    def menu(self, title, actions=None):
        """Add a menu named *title* to the menu bar and return it.

        When *actions* is truthy it is passed to ``addActions`` to populate
        the new menu.
        """
        new_menu = self.menuBar().addMenu(title)
        if not actions:
            return new_menu
        addActions(new_menu, actions)
        return new_menu

    def toolbar(self, title, actions=None):
        """Create a ``ToolBar`` named *title*, attach it on the left, and return it.

        The toolbar's object name is ``<title>ToolBar`` and its buttons show
        text under the icon. When *actions* is truthy it is passed to
        ``addActions`` before the toolbar is added to the window.
        """
        bar = ToolBar(title)
        bar.setObjectName(u'%sToolBar' % title)
        bar.setToolButtonStyle(Qt.ToolButtonTextUnderIcon)
        if actions:
            addActions(bar, actions)
        self.addToolBar(Qt.LeftToolBarArea, bar)
        return bar
class MainWindow(QMainWindow, WindowMixin):
class MainWindow(QMainWindow):
FIT_WINDOW, FIT_WIDTH, MANUAL_ZOOM = list(range(3))
def __init__(self, lang="ch", gpu=False, defaultFilename=None, defaultPrefdefClassFile=None, defaultSaveDir=None):
def __init__(self,
lang="ch",
gpu=False,
default_filename=None,
default_predefined_class_file=None,
default_save_dir=None):
super(MainWindow, self).__init__()
self.setWindowTitle(__appname__)
self.setWindowState(Qt.WindowMaximized) # set window max
self.activateWindow() # PPOCRLabel goes to the front when activate
# Load setting in the main thread
self.settings = Settings()
......@@ -110,11 +79,17 @@ class MainWindow(QMainWindow, WindowMixin):
# Load string bundle for i18n
if lang not in ['ch', 'en']:
lang = 'en'
self.stringBundle = StringBundle.getBundle(localeStr='zh-CN' if lang=='ch' else 'en') # 'en'
self.stringBundle = StringBundle.getBundle(localeStr='zh-CN' if lang == 'ch' else 'en') # 'en'
getStr = lambda strId: self.stringBundle.getString(strId)
self.defaultSaveDir = defaultSaveDir
self.ocr = PaddleOCR(use_pdserving=False, use_angle_cls=True, det=True, cls=True, use_gpu=gpu, lang=lang, show_log=False)
self.defaultSaveDir = default_save_dir
self.ocr = PaddleOCR(use_pdserving=False,
use_angle_cls=True,
det=True,
cls=True,
use_gpu=gpu,
lang=lang,
show_log=False)
if os.path.exists('./data/paddle.png'):
result = self.ocr.ocr('./data/paddle.png', cls=True, det=True)
......@@ -132,7 +107,6 @@ class MainWindow(QMainWindow, WindowMixin):
self.labelFile = None
self.currIndex = 0
# Whether we need to save or not.
self.dirty = False
......@@ -142,7 +116,7 @@ class MainWindow(QMainWindow, WindowMixin):
self.screencast = "https://github.com/PaddlePaddle/PaddleOCR"
# Load predefined classes to the list
self.loadPredefinedClasses(defaultPrefdefClassFile)
self.loadPredefinedClasses(default_predefined_class_file)
# Main widgets and related state.
self.labelDialog = LabelDialog(parent=self, listItem=self.labelHist)
......@@ -158,7 +132,7 @@ class MainWindow(QMainWindow, WindowMixin):
self.PPreader = None
self.autoSaveNum = 5
################# file list ###############
# ================== File List ==================
self.fileListWidget = QListWidget()
self.fileListWidget.itemClicked.connect(self.fileitemDoubleClicked)
self.fileListWidget.setIconSize(QSize(25, 25))
......@@ -178,12 +152,13 @@ class MainWindow(QMainWindow, WindowMixin):
fileListContainer = QWidget()
fileListContainer.setLayout(filelistLayout)
self.filedock = QDockWidget(getStr('fileList'), self)
self.filedock.setObjectName(getStr('files'))
self.filedock.setWidget(fileListContainer)
self.addDockWidget(Qt.LeftDockWidgetArea, self.filedock)
self.fileListName = getStr('fileList')
self.fileDock = QDockWidget(self.fileListName, self)
self.fileDock.setObjectName(getStr('files'))
self.fileDock.setWidget(fileListContainer)
self.addDockWidget(Qt.LeftDockWidgetArea, self.fileDock)
######## Right area ##########
# ================== Right Area ==================
listLayout = QVBoxLayout()
listLayout.setContentsMargins(0, 0, 0, 0)
......@@ -199,7 +174,6 @@ class MainWindow(QMainWindow, WindowMixin):
self.DelButton = QToolButton()
self.DelButton.setToolButtonStyle(Qt.ToolButtonTextBesideIcon)
lefttoptoolbox = QHBoxLayout()
lefttoptoolbox.addWidget(self.newButton)
lefttoptoolbox.addWidget(self.reRecogButton)
......@@ -207,36 +181,37 @@ class MainWindow(QMainWindow, WindowMixin):
lefttoptoolboxcontainer.setLayout(lefttoptoolbox)
listLayout.addWidget(lefttoptoolboxcontainer)
################## label list ####################
# ================== Label List ==================
# Create and add a widget for showing current label items
self.labelList = EditInList()
labelListContainer = QWidget()
labelListContainer.setLayout(listLayout)
#self.labelList.itemActivated.connect(self.labelSelectionChanged)
self.labelList.itemSelectionChanged.connect(self.labelSelectionChanged)
self.labelList.clicked.connect(self.labelList.item_clicked)
# Connect to itemChanged to detect checkbox changes.
self.labelList.itemChanged.connect(self.labelItemChanged)
self.labelListDock = QDockWidget(getStr('recognitionResult'),self)
self.labelListDockName = getStr('recognitionResult')
self.labelListDock = QDockWidget(self.labelListDockName, self)
self.labelListDock.setWidget(self.labelList)
self.labelListDock.setFeatures(QDockWidget.NoDockWidgetFeatures)
listLayout.addWidget(self.labelListDock)
################## detection box ####################
# ================== Detection Box ==================
self.BoxList = QListWidget()
#self.BoxList.itemActivated.connect(self.boxSelectionChanged)
# self.BoxList.itemActivated.connect(self.boxSelectionChanged)
self.BoxList.itemSelectionChanged.connect(self.boxSelectionChanged)
self.BoxList.itemDoubleClicked.connect(self.editBox)
# Connect to itemChanged to detect checkbox changes.
self.BoxList.itemChanged.connect(self.boxItemChanged)
self.BoxListDock = QDockWidget(getStr('detectionBoxposition'), self)
self.BoxListDockName = getStr('detectionBoxposition')
self.BoxListDock = QDockWidget(self.BoxListDockName, self)
self.BoxListDock.setWidget(self.BoxList)
self.BoxListDock.setFeatures(QDockWidget.NoDockWidgetFeatures)
listLayout.addWidget(self.BoxListDock)
############ lower right area ############
# ================== Lower Right Area ==================
leftbtmtoolbox = QHBoxLayout()
leftbtmtoolbox.addWidget(self.SaveButton)
leftbtmtoolbox.addWidget(self.DelButton)
......@@ -248,26 +223,26 @@ class MainWindow(QMainWindow, WindowMixin):
self.dock.setObjectName(getStr('labels'))
self.dock.setWidget(labelListContainer)
# ================== Zoom Bar ==================
self.imageSlider = QSlider(Qt.Horizontal)
self.imageSlider.valueChanged.connect(self.CanvasSizeChange)
self.imageSlider.setMinimum(-9)
self.imageSlider.setMaximum(510)
self.imageSlider.setSingleStep(1)
self.imageSlider.setTickPosition(QSlider.TicksBelow)
self.imageSlider.setTickInterval(1)
########## zoom bar #########
self.imgsplider = QSlider(Qt.Horizontal)
self.imgsplider.valueChanged.connect(self.CanvasSizeChange)
self.imgsplider.setMinimum(-150)
self.imgsplider.setMaximum(150)
self.imgsplider.setSingleStep(1)
self.imgsplider.setTickPosition(QSlider.TicksBelow)
self.imgsplider.setTickInterval(1)
op = QGraphicsOpacityEffect()
op.setOpacity(0.2)
self.imgsplider.setGraphicsEffect(op)
# self.imgsplider.setAttribute(Qt.WA_TranslucentBackground)
self.imgsplider.setStyleSheet("background-color:transparent")
self.imgsliderDock = QDockWidget(getStr('ImageResize'), self)
self.imgsliderDock.setObjectName(getStr('IR'))
self.imgsliderDock.setWidget(self.imgsplider)
self.imgsliderDock.setFeatures(QDockWidget.DockWidgetFloatable)
self.imgsliderDock.setAttribute(Qt.WA_TranslucentBackground)
self.addDockWidget(Qt.RightDockWidgetArea, self.imgsliderDock)
self.imageSlider.setGraphicsEffect(op)
self.imageSlider.setStyleSheet("background-color:transparent")
self.imageSliderDock = QDockWidget(getStr('ImageResize'), self)
self.imageSliderDock.setObjectName(getStr('IR'))
self.imageSliderDock.setWidget(self.imageSlider)
self.imageSliderDock.setFeatures(QDockWidget.DockWidgetFloatable)
self.imageSliderDock.setAttribute(Qt.WA_TranslucentBackground)
self.addDockWidget(Qt.RightDockWidgetArea, self.imageSliderDock)
self.zoomWidget = ZoomWidget()
self.colorDialog = ColorDialog(parent=self)
......@@ -275,13 +250,13 @@ class MainWindow(QMainWindow, WindowMixin):
self.msgBox = QMessageBox()
########## thumbnail #########
# ================== Thumbnail ==================
hlayout = QHBoxLayout()
m = (0, 0, 0, 0)
hlayout.setSpacing(0)
hlayout.setContentsMargins(*m)
self.preButton = QToolButton()
self.preButton.setIcon(newIcon("prev",40))
self.preButton.setIcon(newIcon("prev", 40))
self.preButton.setIconSize(QSize(40, 100))
self.preButton.clicked.connect(self.openPrevImg)
self.preButton.setStyleSheet('border: none;')
......@@ -291,10 +266,10 @@ class MainWindow(QMainWindow, WindowMixin):
self.iconlist.setFlow(QListView.TopToBottom)
self.iconlist.setSpacing(10)
self.iconlist.setIconSize(QSize(50, 50))
self.iconlist.setMovement(False)
self.iconlist.setMovement(QListView.Static)
self.iconlist.setResizeMode(QListView.Adjust)
self.iconlist.itemClicked.connect(self.iconitemDoubleClicked)
self.iconlist.setStyleSheet("background-color:transparent; border: none;")
self.iconlist.setStyleSheet("QListWidget{ background-color:transparent; border: none;}")
self.iconlist.setHorizontalScrollBarPolicy(Qt.ScrollBarAlwaysOff)
self.nextButton = QToolButton()
self.nextButton.setIcon(newIcon("next", 40))
......@@ -307,12 +282,11 @@ class MainWindow(QMainWindow, WindowMixin):
hlayout.addWidget(self.iconlist)
hlayout.addWidget(self.nextButton)
iconListContainer = QWidget()
iconListContainer.setLayout(hlayout)
iconListContainer.setFixedHeight(100)
########### Canvas ###########
# ================== Canvas ==================
self.canvas = Canvas(parent=self)
self.canvas.zoomRequest.connect(self.zoomRequest)
self.canvas.setDrawingShapeToSquare(settings.get(SETTING_DRAW_SQUARE, False))
......@@ -335,32 +309,17 @@ class MainWindow(QMainWindow, WindowMixin):
centerLayout = QVBoxLayout()
centerLayout.setContentsMargins(0, 0, 0, 0)
centerLayout.addWidget(scroll)
#centerLayout.addWidget(self.icondock)
centerLayout.addWidget(iconListContainer,0,Qt.AlignCenter)
centercontainer = QWidget()
centercontainer.setLayout(centerLayout)
# self.scrolldock = QDockWidget('WorkSpace',self)
# self.scrolldock.setObjectName('WorkSpace')
# self.scrolldock.setWidget(centercontainer)
# self.scrolldock.setFeatures(QDockWidget.NoDockWidgetFeatures)
# orititle = self.scrolldock.titleBarWidget()
# tmpwidget = QWidget()
# self.scrolldock.setTitleBarWidget(tmpwidget)
# del orititle
self.setCentralWidget(centercontainer) #self.scrolldock
self.addDockWidget(Qt.RightDockWidgetArea, self.dock)
centerLayout.addWidget(iconListContainer, 0, Qt.AlignCenter)
centerContainer = QWidget()
centerContainer.setLayout(centerLayout)
# self.filedock.setFeatures(QDockWidget.DockWidgetFloatable)
self.filedock.setFeatures(self.filedock.features() ^ QDockWidget.DockWidgetFloatable)
self.dockFeatures = QDockWidget.DockWidgetClosable | QDockWidget.DockWidgetFloatable
self.dock.setFeatures(self.dock.features() ^ self.dockFeatures)
self.setCentralWidget(centerContainer)
self.addDockWidget(Qt.RightDockWidgetArea, self.dock)
self.filedock.setFeatures(QDockWidget.NoDockWidgetFeatures)
self.dock.setFeatures(QDockWidget.DockWidgetClosable | QDockWidget.DockWidgetFloatable)
self.fileDock.setFeatures(QDockWidget.NoDockWidgetFeatures)
###### Actions #######
# ================== Actions ==================
action = partial(newAction, self)
quit = action(getStr('quit'), self.close,
'Ctrl+Q', 'quit', getStr('quitApp'))
......@@ -369,13 +328,13 @@ class MainWindow(QMainWindow, WindowMixin):
'Ctrl+u', 'open', getStr('openDir'))
open_dataset_dir = action(getStr('openDatasetDir'), self.openDatasetDirDialog,
'Ctrl+p', 'open', getStr('openDatasetDir'), enabled=False)
'Ctrl+p', 'open', getStr('openDatasetDir'), enabled=False)
save = action(getStr('save'), self.saveFile,
'Ctrl+V', 'verify', getStr('saveDetail'), enabled=False)
alcm = action(getStr('choosemodel'), self.autolcm,
'Ctrl+M', 'next', getStr('tipchoosemodel'))
'Ctrl+M', 'next', getStr('tipchoosemodel'))
deleteImg = action(getStr('deleteImg'), self.deleteImg, 'Ctrl+Shift+D', 'close', getStr('deleteImgDetail'),
enabled=True)
......@@ -394,8 +353,8 @@ class MainWindow(QMainWindow, WindowMixin):
'w', 'objects', getStr('crtBoxDetail'), enabled=False)
delete = action(getStr('delBox'), self.deleteSelectedShape,
'backspace', 'delete', getStr('delBoxDetail'), enabled=False)
'Alt+X', 'delete', getStr('delBoxDetail'), enabled=False)
copy = action(getStr('dupBox'), self.copySelectedShape,
'Ctrl+C', 'copy', getStr('dupBoxDetail'),
enabled=False)
......@@ -406,7 +365,6 @@ class MainWindow(QMainWindow, WindowMixin):
showAll = action(getStr('showBox'), partial(self.togglePolygons, True),
'Ctrl+A', 'hide', getStr('showAllBoxDetail'),
enabled=False)
help = action(getStr('tutorial'), self.showTutorialDialog, None, 'help', getStr('tutorialDetail'))
showInfo = action(getStr('info'), self.showInfoDialog, None, 'help', getStr('info'))
......@@ -448,12 +406,12 @@ class MainWindow(QMainWindow, WindowMixin):
'Ctrl+E', 'edit', getStr('editLabelDetail'),
enabled=False)
######## New actions #######
# ================== New Actions ==================
AutoRec = action(getStr('autoRecognition'), self.autoRecognition,
'', 'Auto', getStr('autoRecognition'), enabled=False)
'', 'Auto', getStr('autoRecognition'), enabled=False)
reRec = action(getStr('reRecognition'), self.reRecognition,
'Ctrl+Shift+R', 'reRec', getStr('reRecognition'), enabled=False)
'Ctrl+Shift+R', 'reRec', getStr('reRecognition'), enabled=False)
singleRere = action(getStr('singleRe'), self.singleRerecognition,
'Ctrl+R', 'reRec', getStr('singleRe'), enabled=False)
......@@ -462,23 +420,23 @@ class MainWindow(QMainWindow, WindowMixin):
'q', 'new', getStr('creatPolygon'), enabled=True)
saveRec = action(getStr('saveRec'), self.saveRecResult,
'', 'save', getStr('saveRec'), enabled=False)
'', 'save', getStr('saveRec'), enabled=False)
saveLabel = action(getStr('saveLabel'), self.saveLabelFile, #
'Ctrl+S', 'save', getStr('saveLabel'), enabled=False)
saveLabel = action(getStr('saveLabel'), self.saveLabelFile, #
'Ctrl+S', 'save', getStr('saveLabel'), enabled=False)
undoLastPoint = action(getStr("undoLastPoint"), self.canvas.undoLastPoint,
'Ctrl+Z', "undo", getStr("undoLastPoint"), enabled=False)
rotateLeft = action(getStr("rotateLeft"), partial(self.rotateImgAction,1),
'Ctrl+Alt+L', "rotateLeft", getStr("rotateLeft"), enabled=False)
rotateLeft = action(getStr("rotateLeft"), partial(self.rotateImgAction, 1),
'Ctrl+Alt+L', "rotateLeft", getStr("rotateLeft"), enabled=False)
rotateRight = action(getStr("rotateRight"), partial(self.rotateImgAction,-1),
'Ctrl+Alt+R', "rotateRight", getStr("rotateRight"), enabled=False)
rotateRight = action(getStr("rotateRight"), partial(self.rotateImgAction, -1),
'Ctrl+Alt+R', "rotateRight", getStr("rotateRight"), enabled=False)
undo = action(getStr("undo"), self.undoShapeEdit,
'Ctrl+Z', "undo", getStr("undo"), enabled=False)
lock = action(getStr("lockBox"), self.lockSelectedShape,
None, "lock", getStr("lockBoxDetail"),
enabled=False)
......@@ -492,7 +450,7 @@ class MainWindow(QMainWindow, WindowMixin):
# self.preButton.setDefaultAction(openPrevImg)
# self.nextButton.setDefaultAction(openNextImg)
############# Zoom layout ##############
# ================== Zoom layout ==================
zoomLayout = QHBoxLayout()
zoomLayout.addStretch()
self.zoominButton = QToolButton()
......@@ -519,7 +477,6 @@ class MainWindow(QMainWindow, WindowMixin):
icon='color', tip=getStr('shapeFillColorDetail'),
enabled=False)
# Label list context menu.
labelMenu = QMenu()
addActions(labelMenu, (edit, delete))
......@@ -535,39 +492,36 @@ class MainWindow(QMainWindow, WindowMixin):
self.drawSquaresOption.triggered.connect(self.toogleDrawSquare)
# Store actions for further handling.
self.actions = struct(save=save, resetAll=resetAll, deleteImg=deleteImg,
self.actions = struct(save=save, resetAll=resetAll, deleteImg=deleteImg,
lineColor=color1, create=create, delete=delete, edit=edit, copy=copy,
saveRec=saveRec, singleRere=singleRere,AutoRec=AutoRec,reRec=reRec,
saveRec=saveRec, singleRere=singleRere, AutoRec=AutoRec, reRec=reRec,
createMode=createMode, editMode=editMode,
shapeLineColor=shapeLineColor, shapeFillColor=shapeFillColor,
zoom=zoom, zoomIn=zoomIn, zoomOut=zoomOut, zoomOrg=zoomOrg,
fitWindow=fitWindow, fitWidth=fitWidth,
zoomActions=zoomActions, saveLabel=saveLabel,
undo=undo, undoLastPoint=undoLastPoint,open_dataset_dir=open_dataset_dir,
rotateLeft=rotateLeft,rotateRight=rotateRight,lock=lock,
fileMenuActions=(
opendir, open_dataset_dir, saveLabel, resetAll, quit),
undo=undo, undoLastPoint=undoLastPoint, open_dataset_dir=open_dataset_dir,
rotateLeft=rotateLeft, rotateRight=rotateRight, lock=lock,
fileMenuActions=(opendir, open_dataset_dir, saveLabel, resetAll, quit),
beginner=(), advanced=(),
editMenu=(createpoly, edit, copy, delete,singleRere,None, undo, undoLastPoint,
None, rotateLeft, rotateRight, None, color1, self.drawSquaresOption,lock),
beginnerContext=(create, edit, copy, delete, singleRere, rotateLeft, rotateRight,lock),
editMenu=(createpoly, edit, copy, delete, singleRere, None, undo, undoLastPoint,
None, rotateLeft, rotateRight, None, color1, self.drawSquaresOption, lock),
beginnerContext=(create, edit, copy, delete, singleRere, rotateLeft, rotateRight, lock),
advancedContext=(createMode, editMode, edit, copy,
delete, shapeLineColor, shapeFillColor),
onLoadActive=(
create, createMode, editMode),
onLoadActive=(create, createMode, editMode),
onShapesPresent=(hideAll, showAll))
# menus
self.menus = struct(
file=self.menu('&'+getStr('mfile')),
edit=self.menu('&'+getStr('medit')),
view=self.menu('&'+getStr('mview')),
file=self.menu('&' + getStr('mfile')),
edit=self.menu('&' + getStr('medit')),
view=self.menu('&' + getStr('mview')),
autolabel=self.menu('&PaddleOCR'),
help=self.menu('&'+getStr('mhelp')),
help=self.menu('&' + getStr('mhelp')),
recentFiles=QMenu('Open &Recent'),
labelList=labelMenu)
self.lastLabel = None
# Add option to enable/disable labels being displayed at the top of bounding boxes
self.displayLabelOption = QAction(getStr('displayLabel'), self)
......@@ -588,33 +542,30 @@ class MainWindow(QMainWindow, WindowMixin):
self.autoSaveOption.triggered.connect(self.autoSaveFunc)
addActions(self.menus.file,
(opendir, open_dataset_dir, None, saveLabel, saveRec, self.autoSaveOption, None, resetAll, deleteImg, quit))
(opendir, open_dataset_dir, None, saveLabel, saveRec, self.autoSaveOption, None, resetAll, deleteImg,
quit))
addActions(self.menus.help, (showKeys,showSteps, showInfo))
addActions(self.menus.help, (showKeys, showSteps, showInfo))
addActions(self.menus.view, (
self.displayLabelOption, self.labelDialogOption,
None,
None,
hideAll, showAll, None,
zoomIn, zoomOut, zoomOrg, None,
fitWindow, fitWidth))
addActions(self.menus.autolabel, (AutoRec, reRec, alcm, None, help)) #
addActions(self.menus.autolabel, (AutoRec, reRec, alcm, None, help))
self.menus.file.aboutToShow.connect(self.updateFileMenu)
# Custom context menu for the canvas widget:
addActions(self.canvas.menus[0], self.actions.beginnerContext)
#addActions(self.canvas.menus[1], (
# action('&Copy here', self.copyShape),
# action('&Move here', self.moveShape)))
self.statusBar().showMessage('%s started.' % __appname__)
self.statusBar().show()
# Application state.
self.image = QImage()
self.filePath = ustr(defaultFilename)
self.filePath = ustr(default_filename)
self.lastOpenDir = None
self.recentFiles = []
self.maxRecent = 7
......@@ -625,7 +576,7 @@ class MainWindow(QMainWindow, WindowMixin):
# Add Chris
self.difficult = False
## Fix the compatible issue for qt4 and qt5. Convert the QStringList to python list
# Fix the compatibility issue between Qt4 and Qt5: convert the QStringList to a Python list
if settings.get(SETTING_RECENT_FILES):
if have_qstring():
recentFileQStringList = settings.get(SETTING_RECENT_FILES)
......@@ -654,7 +605,6 @@ class MainWindow(QMainWindow, WindowMixin):
# Add chris
Shape.difficult = self.difficult
# ADD:
# Populate the File menu dynamically.
self.updateFileMenu()
......@@ -678,6 +628,12 @@ class MainWindow(QMainWindow, WindowMixin):
if self.filePath and os.path.isdir(self.filePath):
self.openDirDialog(dirpath=self.filePath, silent=True)
def menu(self, title, actions=None):
    """Append a menu called *title* to this window's menu bar and return it.

    A truthy *actions* value is forwarded to ``addActions`` to fill the menu;
    otherwise the menu is returned empty.
    """
    created = self.menuBar().addMenu(title)
    if not actions:
        return created
    addActions(created, actions)
    return created
def keyReleaseEvent(self, event):
if event.key() == Qt.Key_Control:
self.canvas.setDrawingShapeToSquare(False)
......@@ -687,11 +643,9 @@ class MainWindow(QMainWindow, WindowMixin):
# Draw rectangle if Ctrl is pressed
self.canvas.setDrawingShapeToSquare(True)
def noShapes(self):
    """Return True when ``self.itemsToShapes`` is empty (falsy), else False."""
    has_shapes = bool(self.itemsToShapes)
    return not has_shapes
def populateModeActions(self):
self.canvas.menus[0].clear()
addActions(self.canvas.menus[0], self.actions.beginnerContext)
......@@ -699,7 +653,6 @@ class MainWindow(QMainWindow, WindowMixin):
actions = (self.actions.create,) # if self.beginner() else (self.actions.createMode, self.actions.editMode)
addActions(self.menus.edit, actions + self.actions.editMenu)
def setDirty(self):
self.dirty = True
self.actions.save.setEnabled(True)
......@@ -813,10 +766,11 @@ class MainWindow(QMainWindow, WindowMixin):
def rotateImgWarn(self):
    """Warn that rotating an image that already has label boxes disrupts them.

    Exactly one warning dialog is shown through ``self.msgBox``. The Chinese
    text is used when ``self.lang`` is ``'ch'``; the English text otherwise.
    """
    if self.lang == 'ch':
        title = "提示"
        message = "\n 该图片已经有标注框,旋转操作会打乱标注,建议清除标注框后旋转。"
    else:
        title = "Warn"
        # Adjacent literals are joined at compile time; a backslash
        # continuation inside the string would embed the source indentation
        # in the text shown to the user.
        message = ("\n The picture already has a label box, "
                   "and rotation will disrupt the label. "
                   "It is recommended to clear the label box and rotate it.")
    self.msgBox.warning(self, title, message)
def rotateImgAction(self, k=1, _value=False):
......@@ -891,14 +845,13 @@ class MainWindow(QMainWindow, WindowMixin):
self.setDirty()
self.updateComboBox()
######## detection box related functions #######
# =================== detection box related functions ===================
def boxItemChanged(self, item):
shape = self.itemsToShapesbox[item]
box = ast.literal_eval(item.text())
# print('shape in labelItemChanged is',shape.points)
if box != [(p.x(), p.y()) for p in shape.points]:
if box != [(int(p.x()), int(p.y())) for p in shape.points]:
# shape.points = box
shape.points = [QPointF(p[0], p[1]) for p in box]
......@@ -906,7 +859,7 @@ class MainWindow(QMainWindow, WindowMixin):
# shape.line_color = generateColorByText(shape.label)
self.setDirty()
else: # User probably changed item visibility
self.canvas.setShapeVisible(shape, True)#item.checkState() == Qt.Checked
self.canvas.setShapeVisible(shape, True) # item.checkState() == Qt.Checked
def editBox(self): # ADD
if not self.canvas.editing():
......@@ -956,11 +909,10 @@ class MainWindow(QMainWindow, WindowMixin):
def indexTo5Files(self, currIndex):
    """Return a window of up to five entries of ``self.mImgList`` around *currIndex*.

    Near the start of the list the first five entries are returned, near the
    end the last five, and otherwise the two neighbours on each side of
    *currIndex* together with the entry itself.
    """
    images = self.mImgList
    if currIndex < 2:
        return images[:5]
    if currIndex > len(images) - 3:
        return images[-5:]
    return images[currIndex - 2:currIndex + 3]
# Tzutalin 20160906 : Add file list and dock to move faster
def fileitemDoubleClicked(self, item=None):
......@@ -980,9 +932,8 @@ class MainWindow(QMainWindow, WindowMixin):
self.loadFile(filename)
def CanvasSizeChange(self):
    """Copy the image slider's current value into the zoom widget.

    Nothing happens unless at least one image is listed in ``self.mImgList``
    and the slider currently has focus.
    """
    # NOTE(review): the focus check presumably keeps programmatic
    # ``imageSlider.setValue`` calls from feeding back into the zoom widget
    # -- confirm against the callers.
    if len(self.mImgList) > 0 and self.imageSlider.hasFocus():
        self.zoomWidget.setValue(self.imageSlider.value())
def shapeSelectionChanged(self, selected_shapes):
self._noSelectionSlot = True
......@@ -995,7 +946,7 @@ class MainWindow(QMainWindow, WindowMixin):
self.shapesToItems[shape].setSelected(True)
self.shapesToItemsbox[shape].setSelected(True)
self.labelList.scrollToItem(self.currentItem()) # QAbstractItemView.EnsureVisible
self.labelList.scrollToItem(self.currentItem()) # QAbstractItemView.EnsureVisible
self.BoxList.scrollToItem(self.currentBox())
self._noSelectionSlot = False
......@@ -1027,6 +978,10 @@ class MainWindow(QMainWindow, WindowMixin):
action.setEnabled(True)
self.updateComboBox()
# update show counting
self.BoxListDock.setWindowTitle(self.BoxListDockName + f" ({self.BoxList.count()})")
self.labelListDock.setWindowTitle(self.labelListDockName + f" ({self.labelList.count()})")
def remLabels(self, shapes):
if shapes is None:
# print('rm empty label')
......@@ -1048,7 +1003,7 @@ class MainWindow(QMainWindow, WindowMixin):
def loadLabels(self, shapes):
s = []
for label, points, line_color, fill_color, difficult in shapes:
shape = Shape(label=label,line_color=line_color)
shape = Shape(label=label, line_color=line_color)
for x, y in points:
# Ensure the labels are within the bounds of the image. If not, fix them.
......@@ -1058,7 +1013,7 @@ class MainWindow(QMainWindow, WindowMixin):
shape.addPoint(QPointF(x, y))
shape.difficult = difficult
#shape.locked = False
# shape.locked = False
shape.close()
s.append(shape)
......@@ -1071,12 +1026,11 @@ class MainWindow(QMainWindow, WindowMixin):
# shape.fill_color = QColor(*fill_color)
# else:
# shape.fill_color = generateColorByText(label)
self.addLabel(shape)
self.updateComboBox()
self.canvas.loadShapes(s)
def singleLabel(self, shape):
if shape is None:
......@@ -1112,13 +1066,13 @@ class MainWindow(QMainWindow, WindowMixin):
line_color=s.line_color.getRgb(),
fill_color=s.fill_color.getRgb(),
points=[(int(p.x()), int(p.y())) for p in s.points], # QPonitF
# add chris
# add chris
difficult=s.difficult) # bool
shapes = [] if mode == 'Auto' else \
[format_shape(shape) for shape in self.canvas.shapes if shape.line_color != DEFAULT_LOCK_COLOR]
# Can add different annotation formats here
for box in self.result_dic :
for box in self.result_dic:
trans_dic = {"label": box[1][0], "points": box[0], 'difficult': False}
if trans_dic["label"] == "" and mode == 'Auto':
continue
......@@ -1127,7 +1081,8 @@ class MainWindow(QMainWindow, WindowMixin):
try:
trans_dic = []
for box in shapes:
trans_dic.append({"transcription": box['label'], "points": box['points'], 'difficult': box['difficult']})
trans_dic.append(
{"transcription": box['label'], "points": box['points'], 'difficult': box['difficult']})
self.PPlabel[annotationFilePath] = trans_dic
if mode == 'Auto':
self.Cachelabel[annotationFilePath] = trans_dic
......@@ -1145,8 +1100,7 @@ class MainWindow(QMainWindow, WindowMixin):
for shape in self.canvas.copySelectedShape():
self.addLabel(shape)
# fix copy and delete
#self.shapeSelectionChanged(True)
# self.shapeSelectionChanged(True)
def labelSelectionChanged(self):
if self._noSelectionSlot:
......@@ -1160,10 +1114,9 @@ class MainWindow(QMainWindow, WindowMixin):
else:
self.canvas.deSelectShape()
def boxSelectionChanged(self):
if self._noSelectionSlot:
#self.BoxList.scrollToItem(self.currentBox(), QAbstractItemView.PositionAtCenter)
# self.BoxList.scrollToItem(self.currentBox(), QAbstractItemView.PositionAtCenter)
return
if self.canvas.editing():
selected_shapes = []
......@@ -1174,7 +1127,6 @@ class MainWindow(QMainWindow, WindowMixin):
else:
self.canvas.deSelectShape()
def labelItemChanged(self, item):
shape = self.itemsToShapes[item]
label = item.text()
......@@ -1182,7 +1134,7 @@ class MainWindow(QMainWindow, WindowMixin):
shape.label = item.text()
# shape.line_color = generateColorByText(shape.label)
self.setDirty()
elif not ((item.checkState()== Qt.Unchecked) ^ (not shape.difficult)):
elif not ((item.checkState() == Qt.Unchecked) ^ (not shape.difficult)):
shape.difficult = True if item.checkState() == Qt.Unchecked else False
self.setDirty()
else: # User probably changed item visibility
......@@ -1208,7 +1160,7 @@ class MainWindow(QMainWindow, WindowMixin):
if text is not None:
self.prevLabelText = self.stringBundle.getString('tempLabel')
# generate_color = generateColorByText(text)
shape = self.canvas.setLastLabel(text, None, None)#generate_color, generate_color
shape = self.canvas.setLastLabel(text, None, None) # generate_color, generate_color
self.addLabel(shape)
if self.beginner(): # Switch to edit mode.
self.canvas.setEditing(True)
......@@ -1236,6 +1188,7 @@ class MainWindow(QMainWindow, WindowMixin):
def addZoom(self, increment=10):
    """Change the zoom by *increment* and update the image slider.

    ``setZoom`` is called with the zoom widget's current value plus
    *increment*; the zoom widget is then read again and that value plus
    *increment* is written to ``self.imageSlider``.
    """
    target_zoom = self.zoomWidget.value() + increment
    self.setZoom(target_zoom)
    # NOTE(review): the zoom widget is re-read after setZoom; if setZoom
    # already updates it, the slider receives the increment twice -- confirm.
    slider_value = self.zoomWidget.value() + increment
    self.imageSlider.setValue(slider_value)
def zoomRequest(self, delta):
# get the current scrollbar positions
......@@ -1321,17 +1274,16 @@ class MainWindow(QMainWindow, WindowMixin):
# unicodeFilePath = os.path.abspath(unicodeFilePath)
# Tzutalin 20160906 : Add file list and dock to move faster
# Highlight the file item
if unicodeFilePath and self.fileListWidget.count() > 0:
if unicodeFilePath in self.mImgList:
index = self.mImgList.index(unicodeFilePath)
fileWidgetItem = self.fileListWidget.item(index)
print('unicodeFilePath is', unicodeFilePath)
fileWidgetItem.setSelected(True)
###
self.iconlist.clear()
self.additems5(None)
for i in range(5):
item_tooltip = self.iconlist.item(i).toolTip()
# print(i,"---",item_tooltip)
......@@ -1382,12 +1334,21 @@ class MainWindow(QMainWindow, WindowMixin):
self.showBoundingBoxFromPPlabel(filePath)
self.setWindowTitle(__appname__ + ' ' + filePath)
# Default : select last item if there is at least one item
if self.labelList.count():
self.labelList.setCurrentItem(self.labelList.item(self.labelList.count() - 1))
self.labelList.item(self.labelList.count() - 1).setSelected(True)
# show file list image count
select_indexes = self.fileListWidget.selectedIndexes()
if len(select_indexes) > 0:
self.fileDock.setWindowTitle(self.fileListName + f" ({select_indexes[0].row() + 1}"
f"/{self.fileListWidget.count()})")
# update show counting
self.BoxListDock.setWindowTitle(self.BoxListDockName + f" ({self.BoxList.count()})")
self.labelListDock.setWindowTitle(self.labelListDockName + f" ({self.labelList.count()})")
self.canvas.setFocus(True)
return True
return False
......@@ -1395,24 +1356,23 @@ class MainWindow(QMainWindow, WindowMixin):
def showBoundingBoxFromPPlabel(self, filePath):
    """Load the locked boxes and the saved boxes of ``filePath`` onto the canvas.

    :param filePath: path of the image whose labels should be displayed.
    """
    width, height = self.image.width(), self.image.height()
    imgidx = self.getImglabelidx(filePath)
    shapes = []
    # box['ratio'] of the shapes saved in lockedShapes contains the ratio of the
    # four corner coordinates of the shapes to the height and width of the image
    for box in self.canvas.lockedShapes:
        if self.canvas.isInTheSameImage:
            text = box['transcription']
        else:
            # Placeholder label shown for locked boxes (runtime string, kept as-is).
            text = '锁定框:待检测'
        # Scale the stored ratios back to pixel coordinates of the current image.
        points = [[s[0] * width, s[1] * height] for s in box['ratio']]
        shapes.append((text, points, DEFAULT_LOCK_COLOR, None, box['difficult']))
    if imgidx in self.PPlabel:
        for box in self.PPlabel[imgidx]:
            shapes.append((box['transcription'], box['points'], None, None, box['difficult']))
    self.loadLabels(shapes)
    self.canvas.verified = False
def validFilestate(self, filePath):
if filePath not in self.fileStatedict.keys():
return None
......@@ -1423,7 +1383,7 @@ class MainWindow(QMainWindow, WindowMixin):
def resizeEvent(self, event):
    """Re-fit the image when the window is resized, unless the zoom was set manually.

    :param event: the resize event, forwarded unchanged to the base class.
    """
    if self.canvas and not self.image.isNull() \
            and self.zoomMode != self.MANUAL_ZOOM:
        self.adjustScale()
    super(MainWindow, self).resizeEvent(event)
......@@ -1441,7 +1401,7 @@ class MainWindow(QMainWindow, WindowMixin):
"""Figure out the size of the pixmap in order to fit the main widget."""
e = 2.0 # So that no scrollbars are generated.
w1 = self.centralWidget().width() - e
h1 = self.centralWidget().height() - e -110
h1 = self.centralWidget().height() - e - 110
a1 = w1 / h1
# Calculate a new scale value based on the pixmap's aspect ratio.
w2 = self.canvas.pixmap.width() - 0.0
......@@ -1492,7 +1452,7 @@ class MainWindow(QMainWindow, WindowMixin):
def loadRecent(self, filename):
    """Load ``filename`` if ``mayContinue()`` allows leaving the current state.

    :param filename: path of the file to load.
    """
    if self.mayContinue():
        print(filename, "======")
        self.loadFile(filename)
def scanAllImages(self, folderPath):
......@@ -1507,8 +1467,6 @@ class MainWindow(QMainWindow, WindowMixin):
natural_sort(images, key=lambda x: x.lower())
return images
def openDirDialog(self, _value=False, dirpath=None, silent=False):
if not self.mayContinue():
return
......@@ -1520,15 +1478,15 @@ class MainWindow(QMainWindow, WindowMixin):
defaultOpenDirPath = os.path.dirname(self.filePath) if self.filePath else '.'
if silent != True:
targetDirPath = ustr(QFileDialog.getExistingDirectory(self,
'%s - Open Directory' % __appname__,
defaultOpenDirPath,
QFileDialog.ShowDirsOnly | QFileDialog.DontResolveSymlinks))
'%s - Open Directory' % __appname__,
defaultOpenDirPath,
QFileDialog.ShowDirsOnly | QFileDialog.DontResolveSymlinks))
else:
targetDirPath = ustr(defaultOpenDirPath)
self.lastOpenDir = targetDirPath
self.importDirImages(targetDirPath)
def openDatasetDirDialog(self,):
def openDatasetDirDialog(self):
if self.lastOpenDir and os.path.exists(self.lastOpenDir):
if platform.system() == 'Windows':
os.startfile(self.lastOpenDir)
......@@ -1540,12 +1498,13 @@ class MainWindow(QMainWindow, WindowMixin):
if self.lang == 'ch':
self.msgBox.warning(self, "提示", "\n 原文件夹已不存在,请从新选择数据集路径!")
else:
self.msgBox.warning(self, "Warn", "\n The original folder no longer exists, please choose the data set path again!")
self.msgBox.warning(self, "Warn",
"\n The original folder no longer exists, please choose the data set path again!")
self.actions.open_dataset_dir.setEnabled(False)
defaultOpenDirPath = os.path.dirname(self.filePath) if self.filePath else '.'
def importDirImages(self, dirpath, isDelete = False):
def importDirImages(self, dirpath, isDelete=False):
if not self.mayContinue() or not dirpath:
return
if self.defaultSaveDir and self.defaultSaveDir != dirpath:
......@@ -1553,7 +1512,7 @@ class MainWindow(QMainWindow, WindowMixin):
if not isDelete:
self.loadFilestate(dirpath)
self.PPlabelpath = dirpath+ '/Label.txt'
self.PPlabelpath = dirpath + '/Label.txt'
self.PPlabel = self.loadLabelFile(self.PPlabelpath)
self.Cachelabelpath = dirpath + '/Cache.cach'
self.Cachelabel = self.loadLabelFile(self.Cachelabelpath)
......@@ -1562,7 +1521,6 @@ class MainWindow(QMainWindow, WindowMixin):
self.lastOpenDir = dirpath
self.dirname = dirpath
self.defaultSaveDir = dirpath
self.statusBar().showMessage('%s started. Annotation will be saved to %s' %
(__appname__, self.defaultSaveDir))
......@@ -1596,7 +1554,8 @@ class MainWindow(QMainWindow, WindowMixin):
self.actions.rotateLeft.setEnabled(True)
self.actions.rotateRight.setEnabled(True)
self.fileListWidget.setCurrentRow(0) # set list index to first
self.fileDock.setWindowTitle(self.fileListName + f" (1/{self.fileListWidget.count()})") # show image count
def openPrevImg(self, _value=False):
if len(self.mImgList) <= 0:
......@@ -1632,7 +1591,7 @@ class MainWindow(QMainWindow, WindowMixin):
else:
self.mImgList5 = self.indexTo5Files(currIndex)
if filename:
print('file name in openNext is ',filename)
print('file name in openNext is ', filename)
self.loadFile(filename)
def updateFileListIcon(self, filename):
......@@ -1644,30 +1603,6 @@ class MainWindow(QMainWindow, WindowMixin):
imgidx = self.getImglabelidx(self.filePath)
self._saveFile(imgidx, mode=mode)
def saveFileAs(self, _value=False):
assert not self.image.isNull(), "cannot save empty image"
self._saveFile(self.saveFileDialog())
def saveFileDialog(self, removeExt=True):
caption = '%s - Choose File' % __appname__
filters = 'File (*%s)' % LabelFile.suffix
openDialogPath = self.currentPath()
dlg = QFileDialog(self, caption, openDialogPath, filters)
dlg.setDefaultSuffix(LabelFile.suffix[1:])
dlg.setAcceptMode(QFileDialog.AcceptSave)
filenameWithoutExtension = os.path.splitext(self.filePath)[0]
dlg.selectFile(filenameWithoutExtension)
dlg.setOption(QFileDialog.DontUseNativeDialog, False)
if dlg.exec_():
fullFilePath = ustr(dlg.selectedFiles()[0])
if removeExt:
return os.path.splitext(fullFilePath)[0] # Return file path without the extension.
else:
return fullFilePath
return ''
def saveLockedShapes(self):
self.canvas.lockedShapes = []
self.canvas.selectedShapes = []
......@@ -1680,7 +1615,6 @@ class MainWindow(QMainWindow, WindowMixin):
self.canvas.selectedShapes.remove(s)
self.canvas.shapes.remove(s)
def _saveFile(self, annotationFilePath, mode='Manual'):
if len(self.canvas.lockedShapes) != 0:
self.saveLockedShapes()
......@@ -1690,9 +1624,9 @@ class MainWindow(QMainWindow, WindowMixin):
img = cv2.imread(self.filePath)
width, height = self.image.width(), self.image.height()
for shape in self.canvas.lockedShapes:
box = [[int(p[0]*width), int(p[1]*height)] for p in shape['ratio']]
box = [[int(p[0] * width), int(p[1] * height)] for p in shape['ratio']]
assert len(box) == 4
result = [(shape['transcription'],1)]
result = [(shape['transcription'], 1)]
result.insert(0, box)
self.result_dic_locked.append(result)
self.result_dic += self.result_dic_locked
......@@ -1706,7 +1640,7 @@ class MainWindow(QMainWindow, WindowMixin):
item.setIcon(newIcon('done'))
self.fileStatedict[self.filePath] = 1
if len(self.fileStatedict)%self.autoSaveNum ==0:
if len(self.fileStatedict) % self.autoSaveNum == 0:
self.saveFilestate()
self.savePPlabel(mode='Auto')
......@@ -1739,8 +1673,8 @@ class MainWindow(QMainWindow, WindowMixin):
if platform.system() == 'Windows':
from win32com.shell import shell, shellcon
shell.SHFileOperation((0, shellcon.FO_DELETE, deletePath, None,
shellcon.FOF_SILENT | shellcon.FOF_ALLOWUNDO | shellcon.FOF_NOCONFIRMATION,
None, None))
shellcon.FOF_SILENT | shellcon.FOF_ALLOWUNDO | shellcon.FOF_NOCONFIRMATION,
None, None))
# linux
elif platform.system() == 'Linux':
cmd = 'trash ' + deletePath
......@@ -1790,7 +1724,10 @@ class MainWindow(QMainWindow, WindowMixin):
def discardChangesDialog(self):
    """Ask whether unsaved changes should be saved, discarded, or the action cancelled.

    The message is shown in Chinese when ``self.lang == 'ch'`` and in English
    otherwise.

    :return: the value returned by ``QMessageBox.warning`` for the chosen button.
    """
    yes, no, cancel = QMessageBox.Yes, QMessageBox.No, QMessageBox.Cancel
    if self.lang == 'ch':
        msg = u'您有未保存的变更, 您想保存再继续吗?\n点击 "No" 丢弃所有未保存的变更.'
    else:
        msg = u'You have unsaved changes, would you like to save them and proceed?\nClick "No" to undo all changes.'
    return QMessageBox.warning(self, u'Attention', msg, yes | no | cancel)
def errorMessage(self, title, message):
......@@ -1817,6 +1754,8 @@ class MainWindow(QMainWindow, WindowMixin):
if self.noShapes():
for action in self.actions.onShapesPresent:
action.setEnabled(False)
self.BoxListDock.setWindowTitle(self.BoxListDockName + f" ({self.BoxList.count()})")
self.labelListDock.setWindowTitle(self.labelListDockName + f" ({self.labelList.count()})")
def chshapeLineColor(self):
color = self.colorDialog.getColor(self.lineColor, u'Choose line color',
......@@ -1853,7 +1792,6 @@ class MainWindow(QMainWindow, WindowMixin):
else:
self.labelHist.append(line)
def togglePaintLabelsOption(self):
for shape in self.canvas.shapes:
shape.paintLabel = self.displayLabelOption.isChecked()
......@@ -1882,7 +1820,7 @@ class MainWindow(QMainWindow, WindowMixin):
prelen = lentoken // 2
bfilename = prelen * " " + pfilename + (lentoken - prelen) * " "
# item = QListWidgetItem(QIcon(pix.scaled(100, 100, Qt.KeepAspectRatio, Qt.SmoothTransformation)),filename[:10])
item = QListWidgetItem(QIcon(pix.scaled(100, 100, Qt.IgnoreAspectRatio, Qt.FastTransformation)),pfilename)
item = QListWidgetItem(QIcon(pix.scaled(100, 100, Qt.IgnoreAspectRatio, Qt.FastTransformation)), pfilename)
# item.setForeground(QBrush(Qt.white))
item.setToolTip(file)
self.iconlist.addItem(item)
......@@ -1894,7 +1832,7 @@ class MainWindow(QMainWindow, WindowMixin):
self.iconlist.setMinimumWidth(owidth + 50)
def getImglabelidx(self, filePath):
if platform.system()=='Windows':
if platform.system() == 'Windows':
spliter = '\\'
else:
spliter = '/'
......@@ -1908,15 +1846,14 @@ class MainWindow(QMainWindow, WindowMixin):
uncheckedList = [i for i in self.mImgList if i not in self.fileStatedict.keys()]
self.autoDialog = AutoDialog(parent=self, ocr=self.ocr, mImgList=uncheckedList, lenbar=len(uncheckedList))
self.autoDialog.popUp()
self.currIndex=len(self.mImgList)
self.loadFile(self.filePath) # ADD
self.currIndex = len(self.mImgList) - 1
self.loadFile(self.filePath) # ADD
self.haveAutoReced = True
self.AutoRecognition.setEnabled(False)
self.actions.AutoRec.setEnabled(False)
self.setDirty()
self.saveCacheLabel()
def reRecognition(self):
img = cv2.imread(self.filePath)
# org_box = [dic['points'] for dic in self.PPlabel[self.getImglabelidx(self.filePath)]]
......@@ -1945,24 +1882,27 @@ class MainWindow(QMainWindow, WindowMixin):
print('Can not recognise the box')
if shape.line_color == DEFAULT_LOCK_COLOR:
shape.label = result[0][0]
self.result_dic_locked.append([box,(self.noLabelText,0)])
self.result_dic_locked.append([box, (self.noLabelText, 0)])
else:
self.result_dic.append([box,(self.noLabelText,0)])
self.result_dic.append([box, (self.noLabelText, 0)])
try:
if self.noLabelText == shape.label or result[1][0] == shape.label:
print('label no change')
else:
rec_flag += 1
except IndexError as e:
print('Can not recognise the box')
if (len(self.result_dic) > 0 and rec_flag > 0)or self.canvas.lockedShapes:
self.canvas.isInTheSameImage = True
print('Can not recognise the box')
if (len(self.result_dic) > 0 and rec_flag > 0) or self.canvas.lockedShapes:
self.canvas.isInTheSameImage = True
self.saveFile(mode='Auto')
self.loadFile(self.filePath)
self.canvas.isInTheSameImage = False
self.setDirty()
elif len(self.result_dic) == len(self.canvas.shapes) and rec_flag == 0:
QMessageBox.information(self, "Information", "The recognition result remains unchanged!")
if self.lang == 'ch':
QMessageBox.information(self, "Information", "识别结果保持一致!")
else:
QMessageBox.information(self, "Information", "The recognition result remains unchanged!")
else:
print('Can not recgonise in ', self.filePath)
else:
......@@ -2027,7 +1967,6 @@ class MainWindow(QMainWindow, WindowMixin):
self.AutoRecognition.setEnabled(True)
self.actions.AutoRec.setEnabled(True)
def modelChoose(self):
print(self.comboBox.currentText())
lg_idx = {'Chinese & English': 'ch', 'English': 'en', 'French': 'french', 'German': 'german',
......@@ -2054,14 +1993,12 @@ class MainWindow(QMainWindow, WindowMixin):
self.actions.saveLabel.setEnabled(True)
self.actions.saveRec.setEnabled(True)
def saveFilestate(self):
    """Write ``self.fileStatedict`` to ``self.fileStatepath``.

    The file is UTF-8 text with one ``<key>\\t<state>`` entry per line and is
    overwritten on every call.
    """
    with open(self.fileStatepath, 'w', encoding='utf-8') as f:
        for key, state in self.fileStatedict.items():
            f.write(key + '\t' + str(state) + '\n')
def loadLabelFile(self, labelpath):
labeldict = {}
if not os.path.exists(labelpath):
......@@ -2080,8 +2017,7 @@ class MainWindow(QMainWindow, WindowMixin):
labeldict[file] = []
return labeldict
def savePPlabel(self,mode='Manual'):
def savePPlabel(self, mode='Manual'):
savedfile = [self.getImglabelidx(i) for i in self.fileStatedict.keys()]
with open(self.PPlabelpath, 'w', encoding='utf-8') as f:
for key in self.PPlabel:
......@@ -2089,8 +2025,11 @@ class MainWindow(QMainWindow, WindowMixin):
f.write(key + '\t')
f.write(json.dumps(self.PPlabel[key], ensure_ascii=False) + '\n')
if mode=='Manual':
msg = 'Images that have been checked are saved in '+ self.PPlabelpath
if mode == 'Manual':
if self.lang == 'ch':
msg = '已将检查过的图片标签保存在 ' + self.PPlabelpath + " 文件中"
else:
msg = 'Images that have been checked are saved in ' + self.PPlabelpath
QMessageBox.information(self, "Information", msg)
def saveCacheLabel(self):
......@@ -2122,17 +2061,19 @@ class MainWindow(QMainWindow, WindowMixin):
for i, label in enumerate(self.PPlabel[idx]):
if label['difficult']: continue
img_crop = get_rotate_crop_image(img, np.array(label['points'], np.float32))
img_name = os.path.splitext(os.path.basename(idx))[0] + '_crop_'+str(i)+'.jpg'
cv2.imwrite(crop_img_dir+img_name, img_crop)
f.write('crop_img/'+ img_name + '\t')
img_name = os.path.splitext(os.path.basename(idx))[0] + '_crop_' + str(i) + '.jpg'
cv2.imwrite(crop_img_dir + img_name, img_crop)
f.write('crop_img/' + img_name + '\t')
f.write(label['transcription'] + '\n')
except Exception as e:
ques_img.append(key)
print("Can not read image ",e)
print("Can not read image ", e)
if ques_img:
QMessageBox.information(self, "Information", "The following images can not be saved, "
"please check the image path and labels.\n" + "".join(str(i)+'\n' for i in ques_img))
QMessageBox.information(self, "Information", "Cropped images have been saved in "+str(crop_img_dir))
QMessageBox.information(self,
"Information",
"The following images can not be saved, please check the image path and labels.\n"
+ "".join(str(i) + '\n' for i in ques_img))
QMessageBox.information(self, "Information", "Cropped images have been saved in " + str(crop_img_dir))
def speedChoose(self):
if self.labelDialogOption.isChecked():
......@@ -2145,14 +2086,14 @@ class MainWindow(QMainWindow, WindowMixin):
def autoSaveFunc(self):
    """Set how often labels are auto-saved, based on the auto-save option.

    When the option is checked, ``autoSaveNum`` becomes 1 and the label file
    is saved once immediately (best effort). Otherwise ``autoSaveNum`` is set
    to 5.
    """
    if self.autoSaveOption.isChecked():
        self.autoSaveNum = 1  # Real auto_Save
        try:
            self.saveLabelFile()
        except Exception as e:
            # The immediate save is best-effort: report the failure instead of
            # silently swallowing it, but never abort toggling the option.
            print('Auto save failed: ', e)
        print('The program will automatically save once after confirming an image')
    else:
        self.autoSaveNum = 5  # Used for backup
        print('The program will automatically save once after confirming 5 images (default)')
def undoShapeEdit(self):
......@@ -2169,25 +2110,26 @@ class MainWindow(QMainWindow, WindowMixin):
self.labelList.clearSelection()
self._noSelectionSlot = False
self.canvas.loadShapes(shapes, replace=replace)
print("loadShapes")#1
print("loadShapes") # 1
def lockSelectedShape(self):
"""lock the selsected shapes.
"""lock the selected shapes.
Add self.selectedShapes to lock self.canvas.lockedShapes,
which holds the ratio of the four coordinates of the locked shapes
to the width and height of the image
"""
width, height = self.image.width(), self.image.height()
def format_shape(s):
return dict(label=s.label, # str
line_color=s.line_color.getRgb(),
fill_color=s.fill_color.getRgb(),
ratio=[[int(p.x())/width, int(p.y())/height] for p in s.points], # QPonitF
# add chris
ratio=[[int(p.x()) / width, int(p.y()) / height] for p in s.points], # QPonitF
# add chris
difficult=s.difficult) # bool
#lock
# lock
if len(self.canvas.lockedShapes) == 0:
for s in self.canvas.selectedShapes:
s.line_color = DEFAULT_LOCK_COLOR
......@@ -2199,7 +2141,7 @@ class MainWindow(QMainWindow, WindowMixin):
self.canvas.lockedShapes = trans_dic
self.actions.save.setEnabled(True)
#unlock
# unlock
else:
for s in self.canvas.shapes:
s.line_color = DEFAULT_LINE_COLOR
......@@ -2220,9 +2162,11 @@ def read(filename, default=None):
except:
return default
def str2bool(v):
    """Convert a command-line string to a bool.

    :param v: the text to interpret.
    :return: True when ``v`` is "true", "t" or "1" (case-insensitive),
        False for any other text.
    """
    return v.lower() in ("true", "t", "1")
def get_main_app(argv=[]):
"""
Standard boilerplate Qt application code.
......@@ -2231,23 +2175,24 @@ def get_main_app(argv=[]):
app = QApplication(argv)
app.setApplicationName(__appname__)
app.setWindowIcon(newIcon("app"))
# Tzutalin 201705+: Accept extra agruments to change predefined class file
argparser = argparse.ArgumentParser()
argparser.add_argument("--lang", type=str, default='en', nargs="?")
argparser.add_argument("--gpu", type=str2bool, default=False, nargs="?")
argparser.add_argument("--predefined_classes_file",
default=os.path.join(os.path.dirname(__file__), "data", "predefined_classes.txt"),
nargs="?")
args = argparser.parse_args(argv[1:])
# Usage : labelImg.py image predefClassFile saveDir
win = MainWindow(lang=args.lang, gpu=args.gpu,
defaultPrefdefClassFile=args.predefined_classes_file)
# Tzutalin 201705+: Accept extra arguments to change predefined class file
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument("--lang", type=str, default='en', nargs="?")
arg_parser.add_argument("--gpu", type=str2bool, default=True, nargs="?")
arg_parser.add_argument("--predefined_classes_file",
default=os.path.join(os.path.dirname(__file__), "data", "predefined_classes.txt"),
nargs="?")
args = arg_parser.parse_args(argv[1:])
win = MainWindow(lang=args.lang,
gpu=args.gpu,
default_predefined_class_file=args.predefined_classes_file)
win.show()
return app, win
def main():
    """Construct the main app and run it.

    :return: the exit status returned by the application's event loop.
    """
    app, _win = get_main_app(sys.argv)
    return app.exec_()
......@@ -2259,5 +2204,5 @@ if __name__ == '__main__':
output = os.system('pyrcc5 -o libs/resources.py resources.qrc')
assert output == 0, "operate the cmd have some problems ,please check whether there is a in the lib " \
"directory resources.py "
import libs.resources
sys.exit(main())
......@@ -8,6 +8,8 @@ PPOCRLabel is a semi-automatic graphic annotation tool suitable for OCR field, w
### Recent Update
- 2022.01: (by [PeterH0323](https://github.com/peterh0323))
- Improve user experience: show the number of files and labels, optimize interaction, and fix bugs such as inference running only on the CPU
- 2021.11.17:
- Support install and start PPOCRLabel through the whl package (by [d2623587501](https://github.com/d2623587501))
- Dataset segmentation: Divide the annotation file into training, verification and testing parts (refer to section 3.5 below, by [MrCuiHao](https://github.com/MrCuiHao))
......@@ -110,7 +112,7 @@ python PPOCRLabel.py
6. Click 're-Recognition', the model will rewrite ALL recognition results in ALL detection boxes<sup>[3]</sup>.
7. Double click the result in 'recognition result' list to manually change inaccurate recognition results.
7. Single click the result in 'recognition result' list to manually change inaccurate recognition results.
8. **Click "Check": the image status will switch to "√", then the program automatically jumps to the next image.**
......@@ -143,15 +145,17 @@ python PPOCRLabel.py
### 3.1 Shortcut keys
| Shortcut keys | Description |
| ------------------------ | ------------------------------------------------ |
|--------------------------|--------------------------------------------------|
| Ctrl + Shift + R | Re-recognize all the labels of the current image |
| W | Create a rect box |
| Q | Create a four-points box |
| X | Rotate the box anti-clockwise |
| C | Rotate the box clockwise |
| Ctrl + E | Edit label of the selected box |
| Ctrl + R | Re-recognize the selected box |
| Ctrl + C | Copy and paste the selected box |
| Ctrl + Left Mouse Button | Multi select the label box |
| Backspace | Delete the selected box |
| Alt + X | Delete the selected box |
| Ctrl + V | Check image |
| Ctrl + Shift + d | Delete image |
| D | Next image |
......
......@@ -8,6 +8,8 @@ PPOCRLabel是一款适用于OCR领域的半自动化图形标注工具,内置P
#### 近期更新
- 2022.01:(by [PeterH0323](https://github.com/peterh0323))
- 提升用户体验:新增文件与标记数目提示、优化交互、修复gpu使用等问题
- 2021.11.17:
- 新增支持通过whl包安装和启动PPOCRLabel(by [d2623587501](https://github.com/d2623587501))
- 标注数据集切分:对标注数据进行训练、验证与测试集划分(参考下方3.5节,by [MrCuiHao](https://github.com/MrCuiHao))
......@@ -102,7 +104,7 @@ python PPOCRLabel.py --lang ch
4. 手动标注:点击 “矩形标注”(推荐直接在英文模式下点击键盘中的 “W”),用户可对当前图片中模型未检出的部分进行手动绘制标记框。点击键盘Q,则使用四点标注模式(或点击“编辑” - “四点标注”),用户依次点击4个点后,双击左键表示标注完成。
5. 标记框绘制完成后,用户点击 “确认”,检测框会先被预分配一个 “待识别” 标签。
6. 重新识别:将图片中的所有检测框绘制/调整完成后,点击 “重新识别”,PPOCR模型会对当前图片中的**所有检测框**重新识别<sup>[3]</sup>
7. 内容更改:击识别结果,对不准确的识别结果进行手动更改。
7. 内容更改:击识别结果,对不准确的识别结果进行手动更改。
8. **确认标记:点击 “确认”,图片状态切换为 “√”,跳转至下一张。**
9. 删除:点击 “删除图像”,图片将会被删除至回收站。
10. 导出结果:用户可以通过菜单中“文件-导出标记结果”手动导出,同时也可以点击“文件 - 自动导出标记结果”开启自动导出。手动确认过的标记将会被存放在所打开图片文件夹下的*Label.txt*中。在菜单栏点击 “文件” - "导出识别结果"后,会将此类图片的识别训练数据保存在*crop_img*文件夹下,识别标签保存在*rec_gt.txt*<sup>[4]</sup>
......@@ -131,23 +133,25 @@ python PPOCRLabel.py --lang ch
### 3.1 快捷键
| 快捷键 | 说明 |
| ---------------- | ---------------------------- |
| Ctrl + shift + R | 对当前图片的所有标记重新识别 |
| W | 新建矩形框 |
| Q | 新建四点框 |
| Ctrl + E | 编辑所选框标签 |
| Ctrl + R | 重新识别所选标记 |
| 快捷键 | 说明 |
|------------------|----------------|
| Ctrl + shift + R | 对当前图片的所有标记重新识别 |
| W | 新建矩形框 |
| Q | 新建四点框 |
| X | 框逆时针旋转 |
| C | 框顺时针旋转 |
| Ctrl + E | 编辑所选框标签 |
| Ctrl + R | 重新识别所选标记 |
| Ctrl + C | 复制并粘贴选中的标记框 |
| Ctrl + 鼠标左键 | 多选标记框 |
| Backspace | 删除所选框 |
| Ctrl + V | 确认本张图片标记 |
| Ctrl + Shift + d | 删除本张图片 |
| D | 下一张图片 |
| A | 上一张图片 |
| Ctrl++ | 缩小 |
| Ctrl-- | 放大 |
| ↑→↓← | 移动标记框 |
| Ctrl + 鼠标左键 | 多选标记框 |
| Alt + X | 删除所选框 |
| Ctrl + V | 确认本张图片标记 |
| Ctrl + Shift + d | 删除本张图片 |
| D | 下一张图片 |
| A | 上一张图片 |
| Ctrl++ | 放大 |
| Ctrl-- | 缩小 |
| ↑→↓← | 移动标记框 |
### 3.2 内置模型
......
# Copyright (c) <2015-Present> Tzutalin
# Copyright (C) 2013 MIT, Computer Science and Artificial Intelligence Laboratory. Bryan Russell, Antonio Torralba,
# William T. Freeman. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
# associated documentation files (the "Software"), to deal in the Software without restriction, including without
# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the
# Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of
# the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
# NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
# SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
import sys
try:
from PyQt5.QtWidgets import QWidget, QHBoxLayout, QComboBox
except ImportError:
# needed for py3+qt4
# Ref:
# http://pyqt.sourceforge.net/Docs/PyQt4/incompatible_apis.html
# http://stackoverflow.com/questions/21217399/pyqt4-qtcore-qvariant-object-instead-of-a-string
if sys.version_info.major >= 3:
import sip
sip.setapi('QVariant', 2)
from PyQt4.QtGui import QWidget, QHBoxLayout, QComboBox
class ComboBox(QWidget):
    """A widget wrapping a single ``QComboBox`` in a horizontal layout."""

    def __init__(self, parent=None, items=None):
        """Build the combo box and populate it.

        :param parent: parent widget; when given, its ``comboSelectionChanged``
            is connected to the combo box's ``currentIndexChanged`` signal.
        :param items: list of strings shown in the combo box (defaults to empty).
        """
        super(ComboBox, self).__init__(parent)

        layout = QHBoxLayout()
        self.cb = QComboBox()
        # Avoid a shared mutable default: every instance gets its own list.
        self.items = [] if items is None else items
        self.cb.addItems(self.items)

        # Without a parent there is no slot to notify; skip the connection
        # instead of failing on ``None.comboSelectionChanged``.
        if parent is not None:
            self.cb.currentIndexChanged.connect(parent.comboSelectionChanged)

        layout.addWidget(self.cb)
        self.setLayout(layout)

    def update_items(self, items):
        """Replace the entries of the combo box with ``items``."""
        self.items = items

        self.cb.clear()
        self.cb.addItems(self.items)
......@@ -6,6 +6,8 @@ except ImportError:
from PyQt4.QtGui import *
from PyQt4.QtCore import *
import time
import datetime
import json
import cv2
import numpy as np
......@@ -80,8 +82,9 @@ class AutoDialog(QDialog):
self.parent = parent
self.ocr = ocr
self.mImgList = mImgList
self.lender = lenbar
self.pb = QProgressBar()
self.pb.setRange(0, lenbar)
self.pb.setRange(0, self.lender)
self.pb.setValue(0)
layout = QVBoxLayout()
......@@ -108,10 +111,16 @@ class AutoDialog(QDialog):
self.thread_1.progressBarValue.connect(self.handleProgressBarSingal)
self.thread_1.listValue.connect(self.handleListWidgetSingal)
self.thread_1.endsignal.connect(self.handleEndsignalSignal)
self.time_start = time.time() # save start time
def handleProgressBarSingal(self, i):
    """Update the progress bar and show the estimated remaining time in the title.

    :param i: value emitted by the worker's progress signal; used as the
        number of items finished so far.
    """
    self.pb.setValue(i)

    # No item finished yet: an average per-item time cannot be computed
    # (it would divide by zero), so leave the window title untouched.
    if i <= 0:
        return

    # calculate time left of auto labeling
    avg_time = (time.time() - self.time_start) / i  # Use average time to prevent time fluctuations
    time_left = str(datetime.timedelta(seconds=avg_time * (self.lender - i))).split(".")[0]  # Remove microseconds
    self.setWindowTitle("PPOCRLabel -- " + f"Time Left: {time_left}")  # show
def handleListWidgetSingal(self, i):
self.listWidget.addItem(i)
titem = self.listWidget.item(self.listWidget.count() - 1)
......
......@@ -11,19 +11,13 @@
# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
try:
from PyQt5.QtGui import *
from PyQt5.QtCore import *
from PyQt5.QtWidgets import *
except ImportError:
from PyQt4.QtGui import *
from PyQt4.QtCore import *
#from PyQt4.QtOpenGL import *
import copy
from PyQt5.QtCore import Qt, pyqtSignal, QPointF, QPoint
from PyQt5.QtGui import QPainter, QBrush, QColor, QPixmap
from PyQt5.QtWidgets import QWidget, QMenu, QApplication
from libs.shape import Shape
from libs.utils import distance
import copy
CURSOR_DEFAULT = Qt.ArrowCursor
CURSOR_POINT = Qt.PointingHandCursor
......@@ -31,8 +25,6 @@ CURSOR_DRAW = Qt.CrossCursor
CURSOR_MOVE = Qt.ClosedHandCursor
CURSOR_GRAB = Qt.OpenHandCursor
# class Canvas(QGLWidget):
class Canvas(QWidget):
zoomRequest = pyqtSignal(int)
......@@ -129,7 +121,6 @@ class Canvas(QWidget):
def selectedVertex(self):
    """Return True when ``self.hVertex`` is set (i.e. is not None)."""
    return self.hVertex is not None
def mouseMoveEvent(self, ev):
"""Update line with last point and current coordinates."""
pos = self.transformPos(ev.pos())
......@@ -333,7 +324,6 @@ class Canvas(QWidget):
self.movingShape = False
def endMove(self, copy=False):
assert self.selectedShapes and self.selectedShapesCopy
assert len(self.selectedShapesCopy) == len(self.selectedShapes)
......@@ -410,7 +400,6 @@ class Canvas(QWidget):
self.selectionChanged.emit(shapes)
self.update()
def selectShapePoint(self, point, multiple_selection_mode):
"""Select the first shape created which contains this point."""
if self.selectedVertex(): # A vertex is marked for selection.
......@@ -494,7 +483,6 @@ class Canvas(QWidget):
else:
shape.moveVertexBy(index, shiftPos)
def boundedMoveShape(self, shapes, pos):
if type(shapes).__name__ != 'list': shapes = [shapes]
if self.outOfPixmap(pos):
......@@ -515,6 +503,7 @@ class Canvas(QWidget):
if dp:
for shape in shapes:
shape.moveBy(dp)
shape.close()
self.prevPoint = pos
return True
return False
......@@ -728,6 +717,31 @@ class Canvas(QWidget):
self.moveOnePixel('Up')
elif key == Qt.Key_Down and self.selectedShapes:
self.moveOnePixel('Down')
elif key == Qt.Key_X and self.selectedShapes:
for i in range(len(self.selectedShapes)):
self.selectedShape = self.selectedShapes[i]
if self.rotateOutOfBound(0.01):
continue
self.selectedShape.rotate(0.01)
self.shapeMoved.emit()
self.update()
elif key == Qt.Key_C and self.selectedShapes:
for i in range(len(self.selectedShapes)):
self.selectedShape = self.selectedShapes[i]
if self.rotateOutOfBound(-0.01):
continue
self.selectedShape.rotate(-0.01)
self.shapeMoved.emit()
self.update()
def rotateOutOfBound(self, angle):
    """Check whether rotating the selected shapes would leave the pixmap.

    :param angle: rotation angle passed to each shape's ``rotatePoint``.
    :return: True if any rotated vertex of any selected shape is reported by
        ``outOfPixmap``; False otherwise (also when nothing is selected).
    """
    # Iterate with a local name: this check must not overwrite
    # ``self.selectedShape``, because callers set it to the shape they are
    # about to rotate before calling this method.
    for shape in self.selectedShapes:
        for p in shape.points:
            if self.outOfPixmap(shape.rotatePoint(p, angle)):
                return True
    return False
def moveOnePixel(self, direction):
# print(self.selectedShape.points)
......
import sys, time
from PyQt5 import QtWidgets
from PyQt5.QtGui import *
from PyQt5.QtCore import *
from PyQt5.QtWidgets import *
# !/usr/bin/env python
# -*- coding: utf-8 -*-
from PyQt5.QtCore import QModelIndex
from PyQt5.QtWidgets import QListWidget
class EditInList(QListWidget):
def __init__(self):
super(EditInList,self).__init__()
# click to edit
self.clicked.connect(self.item_clicked)
super(EditInList, self).__init__()
self.edited_item = None
def item_clicked(self, modelindex: QModelIndex):
try:
if self.edited_item is not None:
self.closePersistentEditor(self.edited_item)
except:
self.edited_item = self.currentItem()
def item_clicked(self, modelindex: QModelIndex) -> None:
self.edited_item = self.currentItem()
self.closePersistentEditor(self.edited_item)
item = self.item(modelindex.row())
# time.sleep(0.2)
self.edited_item = item
self.openPersistentEditor(item)
# time.sleep(0.2)
self.editItem(item)
self.edited_item = self.item(modelindex.row())
self.openPersistentEditor(self.edited_item)
self.editItem(self.edited_item)
def mouseDoubleClickEvent(self, event):
# close edit
for i in range(self.count()):
self.closePersistentEditor(self.item(i))
pass
def leaveEvent(self, event):
# close edit
for i in range(self.count()):
self.closePersistentEditor(self.item(i))
\ No newline at end of file
self.closePersistentEditor(self.item(i))
......@@ -10,19 +10,14 @@
# SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#!/usr/bin/python
# !/usr/bin/python
# -*- coding: utf-8 -*-
import math
import sys
try:
from PyQt5.QtGui import *
from PyQt5.QtCore import *
except ImportError:
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt5.QtCore import QPointF
from PyQt5.QtGui import QColor, QPen, QPainterPath, QFont
from libs.utils import distance
import sys
DEFAULT_LINE_COLOR = QColor(0, 255, 0, 128)
DEFAULT_FILL_COLOR = QColor(255, 0, 0, 128)
......@@ -59,6 +54,8 @@ class Shape(object):
self.difficult = difficult
self.paintLabel = paintLabel
self.locked = False
self.direction = 0
self.center = None
self._highlightIndex = None
self._highlightMode = self.NEAR_VERTEX
self._highlightSettings = {
......@@ -74,7 +71,24 @@ class Shape(object):
# is used for drawing the pending line a different color.
self.line_color = line_color
def rotate(self, theta):
    """Rotate every point of the shape by ``theta`` and update ``direction``.

    Each point is replaced by ``self.rotatePoint(point, theta)``;
    ``direction`` is decreased by ``theta`` and wrapped into ``[0, 2*pi)``.

    :param theta: rotation angle (radians, as used by ``math.cos``/``math.sin``
        in ``rotatePoint``).
    """
    # Slice assignment keeps the same list object, as the per-index update did.
    self.points[:] = [self.rotatePoint(p, theta) for p in self.points]
    self.direction = (self.direction - theta) % (2 * math.pi)
def rotatePoint(self, p, theta):
    """Return ``p`` rotated by ``theta`` around ``self.center``.

    :param p: the point to rotate; it is not modified.
    :param theta: rotation angle in radians.
    :return: a new ``QPointF`` holding the rotated position.
    """
    # Offset of the point relative to the rotation centre.
    offset = p - self.center
    cos_theta = math.cos(theta)
    sin_theta = math.sin(theta)
    rotated_x = cos_theta * offset.x() + sin_theta * offset.y()
    rotated_y = -sin_theta * offset.x() + cos_theta * offset.y()
    return QPointF(self.center.x() + rotated_x, self.center.y() + rotated_y)
def close(self):
    """Mark the shape as closed and refresh its rotation centre.

    The centre is the midpoint between the first and the third point. It is
    only recomputed when the shape has at least three points; otherwise the
    previous ``center`` is left as it is.
    """
    # Guard against shapes that are closed before they have a third point,
    # which would otherwise raise IndexError on ``self.points[2]``.
    if len(self.points) >= 3:
        self.center = QPointF((self.points[0].x() + self.points[2].x()) / 2,
                              (self.points[0].y() + self.points[2].y()) / 2)
    self._closed = True
def reachMaxPoints(self):
......@@ -83,7 +97,9 @@ class Shape(object):
return False
def addPoint(self, point):
    """Append ``point`` to the shape, or close the shape once it is full.

    :param point: the vertex to add; it is dropped when ``reachMaxPoints()``
        is already true, in which case the shape is closed instead.
    """
    if self.reachMaxPoints():
        # The shape already has its maximum number of points: close it.
        self.close()
    else:
        self.points.append(point)
def popPoint(self):
......@@ -112,7 +128,7 @@ class Shape(object):
# Uncommenting the following line will draw 2 paths
# for the 1st vertex, and make it non-filled, which
# may be desirable.
#self.drawVertex(vrtx_path, 0)
# self.drawVertex(vrtx_path, 0)
for i, p in enumerate(self.points):
line_path.lineTo(p)
......@@ -136,9 +152,9 @@ class Shape(object):
font.setPointSize(8)
font.setBold(True)
painter.setFont(font)
if(self.label == None):
if self.label is None:
self.label = ""
if(min_y < MIN_Y_LABEL):
if min_y < MIN_Y_LABEL:
min_y += MIN_Y_LABEL
painter.drawText(min_x, min_y, self.label)
......@@ -198,6 +214,8 @@ class Shape(object):
def copy(self):
shape = Shape("%s" % self.label)
shape.points = [p for p in self.points]
shape.center = self.center
shape.direction = self.direction
shape.fill = self.fill
shape.selected = self.selected
shape._closed = self._closed
......
......@@ -50,7 +50,7 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训
|模型名称|模型简介|配置文件|推理模型大小|下载地址|
| --- | --- | --- | --- | --- |
|ch_PP-OCRv2_rec_slim|【最新】slim量化版超轻量模型,支持中英文、数字识别|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)| 9M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_train.tar) |
|ch_PP-OCRv2_rec|【最新】原始超轻量模型,支持中英文、数字识别|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)|8.5M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) |
|ch_PP-OCRv2_rec|【最新】原始超轻量模型,支持中英文、数字识别|[ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml)|8.5M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) |
|ch_ppocr_mobile_slim_v2.0_rec|slim裁剪量化版超轻量模型,支持中英文、数字识别|[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml)| 6M |[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_train.tar) |
|ch_ppocr_mobile_v2.0_rec|原始超轻量模型,支持中英文、数字识别|[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml)|5.2M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_train.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_pre.tar) |
|ch_ppocr_server_v2.0_rec|通用模型,支持中英文、数字识别|[rec_chinese_common_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_common_train_v2.0.yml)|94.8M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_train.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_pre.tar) |
......
......@@ -16,22 +16,24 @@ PaddleOCR希望可以通过AI的力量助力任何一位有梦想的开发者实
### 1.1 基于PaddleOCR的社区项目
- 【最新】 [FastOCRLabel](https://gitee.com/BaoJianQiang/FastOCRLabel):完整的C#版本标注工具 (@ [包建强](https://gitee.com/BaoJianQiang) )
#### 1.1.1 通用工具
- [DangoOCR离线版](https://github.com/PantsuDango/DangoOCR):通用型桌面级即时翻译工具 (@ [PantsuDango](https://github.com/PantsuDango))
- [scr2txt](https://github.com/lstwzd/scr2txt):截屏转文字工具 (@ [lstwzd](https://github.com/lstwzd))
- [AI Studio项目](https://aistudio.baidu.com/aistudio/projectdetail/1054614?channelType=0&channel=0):英文视频自动生成字幕( @ [叶月水狐](https://aistudio.baidu.com/aistudio/personalcenter/thirdview/322052))
#### 1.1.2 垂类场景工具
- [id_card_ocr](https://github.com/baseli/id_card_ocr):身份证复印件识别(@ [baseli](https://github.com/baseli))
- [Paddle_Table_Image_Reader](https://github.com/thunder95/Paddle_Table_Image_Reader):能看懂表格图片的数据助手(@ [thunder95](https://github.com/thunder95))
#### 1.1.3 前后处理
- [paddleOCRCorrectOutputs](https://github.com/yuranusduke/paddleOCRCorrectOutputs):获取OCR识别结果的key-value(@ [yuranusduke](https://github.com/yuranusduke))
| 类别 | 项目 | 描述 | 开发者 |
| -------- | ------------------------------------------------------------ | -------------------------- | ------------------------------------------------------------ |
| 通用工具 | [FastOCRLabel](https://gitee.com/BaoJianQiang/FastOCRLabel) | 完整的C#版本标注GUI | [包建强](https://gitee.com/BaoJianQiang) |
| 通用工具 | [DangoOCR离线版](https://github.com/PantsuDango/DangoOCR) | 通用型桌面级即时翻译GUI | [PantsuDango](https://github.com/PantsuDango) |
| 通用工具 | [scr2txt](https://github.com/lstwzd/scr2txt) | 截屏转文字GUI | [lstwzd](https://github.com/lstwzd) |
| 通用工具 | [ocr_sdk](https://github.com/mymagicpower/AIAS/blob/main/1_image_sdks/text_recognition/ocr_sdk) | OCR java SDK工具箱 | [Calvin](https://github.com/mymagicpower) |
| 通用工具 | [iocr](https://github.com/mymagicpower/AIAS/blob/main/8_suite_hub/iocr) | IOCR 自定义模板识别(支持表格识别) | [Calvin](https://github.com/mymagicpower) |
| 通用工具 | [Lmdb Dataset Format Conversion Tool](https://github.com/OneYearIsEnough/PaddleOCR-Recog-LmdbDataset-Conversion) | 文本识别任务中lmdb数据格式转换工具 | [OneYearIsEnough](https://github.com/OneYearIsEnough) |
| 垂类工具 | [AI Studio项目](https://aistudio.baidu.com/aistudio/projectdetail/1054614?channelType=0&channel=0) | 英文视频自动生成字幕 | [叶月水狐](https://aistudio.baidu.com/aistudio/personalcenter/thirdview/322052) |
| 垂类工具 | [id_card_ocr](https://github.com/baseli/id_card_ocr) | 身份证复印件识别 | [baseli](https://github.com/baseli) |
| 垂类工具 | [Paddle_Table_Image_Reader](https://github.com/thunder95/Paddle_Table_Image_Reader) | 能看懂表格图片的数据助手 | [thunder95](https://github.com/thunder95) |
| 垂类工具 | [AI Studio项目](https://aistudio.baidu.com/aistudio/projectdetail/3382897) | OCR流程中对手写体进行过滤 | [daassh](https://github.com/daassh) |
| 垂类工具 | [AI Studio项目](https://aistudio.baidu.com/aistudio/projectdetail/2803693) | 电表读数和编号识别 | [深渊上的坑](https://github.com/edencfc) |
| 前后处理 | [paddleOCRCorrectOutputs](https://github.com/yuranusduke/paddleOCRCorrectOutputs) | 获取OCR识别结果的key-value | [yuranusduke](https://github.com/yuranusduke) |
|前处理| [optlab](https://github.com/GreatV/optlab) |OCR前处理工具箱,基于Qt和Leptonica。|[GreatV](https://github.com/GreatV)|
|应用部署| [PaddleOCRSharp](https://github.com/raoyutian/PaddleOCRSharp) |PaddleOCR的.NET封装与应用部署。|[raoyutian](https://github.com/raoyutian/PaddleOCRSharp)|
|应用部署| [PaddleSharp](https://github.com/sdcb/PaddleSharp) |PaddleOCR的.NET封装与应用部署,支持跨平台、GPU|[sdcb](https://github.com/sdcb)|
| 学术前沿模型训练与推理 | [AI Studio项目](https://aistudio.baidu.com/aistudio/projectdetail/3397137) | StarNet-MobileNetV3算法–中文训练 | [xiaoyangyang2](https://github.com/xiaoyangyang2) |
### 1.2 为PaddleOCR新增功能
......
......@@ -43,8 +43,8 @@ Relationship of the above models is as follows.
|model name|description|config|model size|download|
| --- | --- | --- | --- | --- |
|ch_PP-OCRv2_rec_slim|[New] Slim qunatization with distillation lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)| 9M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_train.tar) |
|ch_PP-OCRv2_rec|[New] Original lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)|8.5M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) |
|ch_PP-OCRv2_rec_slim|[New] Slim quantization with distillation lightweight model, supporting Chinese, English, multilingual text recognition|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)| 9M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_train.tar) |
|ch_PP-OCRv2_rec|[New] Original lightweight model, supporting Chinese, English, multilingual text recognition|[ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml)|8.5M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) |
|ch_ppocr_mobile_slim_v2.0_rec|Slim pruned and quantized lightweight model, supporting Chinese, English and number recognition|[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml)| 6M | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_train.tar) |
|ch_ppocr_mobile_v2.0_rec|Original lightweight model, supporting Chinese, English and number recognition|[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml)|5.2M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_train.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_pre.tar) |
|ch_ppocr_server_v2.0_rec|General model, supporting Chinese, English and number recognition|[rec_chinese_common_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_common_train_v2.0.yml)|94.8M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_train.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_pre.tar) |
......
doc/joinus.PNG

189 KB | W: | H:

doc/joinus.PNG

199 KB | W: | H:

doc/joinus.PNG
doc/joinus.PNG
doc/joinus.PNG
doc/joinus.PNG
  • 2-up
  • Swipe
  • Onion skin
......@@ -81,7 +81,7 @@
"\n",
"如果对某些层使用更小的学习率学习,静态图里还不是很方便,一个方法是在参数初始化的时候,给权重的属性设置固定的学习率,参考:https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/fluid/param_attr/ParamAttr_cn.html#paramattr\n",
"\n",
"实际上我们实验发现,直接加载模型去fine-tune,不设置某些层不同学习率,效果也都不错\n",
"实际上我们实验发现,直接加载模型去fine-tune,不设置某些层不同学习率,效果也都不错\n",
"\n",
"**1.11 DB的预处理部分,图片的长和宽为什么要处理成32的倍数?**\n",
"\n",
......@@ -95,7 +95,7 @@
"\n",
"**1.13 PP-OCR检测效果不好,该如何优化?**\n",
"\n",
"A: 具体问题具体分析:\n",
"**A**: 具体问题具体分析:\n",
"- 如果在你的场景上检测效果不可用,首选是在你的数据上做finetune训练;\n",
"- 如果图像过大,文字过于密集,建议不要过度压缩图像,可以尝试修改检测预处理的resize逻辑,防止图像被过度压缩;\n",
"- 检测框大小过于紧贴文字或检测框过大,可以调整db_unclip_ratio这个参数,加大参数可以扩大检测框,减小参数可以减小检测框大小;\n",
......@@ -123,8 +123,8 @@
"\n",
"**A**:GPU加速预测推荐使用TensorRT。\n",
"- 1. 从[链接](https://paddleinference.paddlepaddle.org.cn/master/user_guides/download_lib.html)下载带TensorRT的Paddle安装包或者预测库。\n",
"- 2. 从Nvidia官网下载TensorRT版本,注意下载的TensorRT版本与paddle安装包中编译的TensorRT版本一致。\n",
"- 3. 设置环境变量LD_LIBRARY_PATH,指向TensorRT的lib文件夹\n",
"- 2. 从Nvidia官网下载[TensorRT](https://developer.nvidia.com/tensorrt),注意下载的TensorRT版本与paddle安装包中编译的TensorRT版本一致。\n",
"- 3. 设置环境变量`LD_LIBRARY_PATH`,指向TensorRT的lib文件夹\n",
"```\n",
"export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<TensorRT-${version}/lib>\n",
"```\n",
......
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -6,7 +6,7 @@
"collapsed": false
},
"source": [
"# OCR七日课之文本检测综述\n"
"# 文本检测算法理论\n"
]
},
{
......@@ -15,11 +15,11 @@
"collapsed": false
},
"source": [
"## 1. 文本检测\n",
"## 1 文本检测\n",
"\n",
"文本检测任务是找出图像或视频中的文字位置。不同于目标检测任务,目标检测不仅要解决定位问题,还要解决目标分类问题。\n",
"\n",
"文本在图像中的表现形式可以视为一种‘目标,通用的目标检测的方法也适用于文本检测,从任务本身上来看:\n",
"文本在图像中的表现形式可以视为一种‘目标,通用的目标检测的方法也适用于文本检测,从任务本身上来看:\n",
"\n",
"- 目标检测:给定图像或者视频,找出目标的位置(box),并给出目标的类别;\n",
"- 文本检测:给定输入图像或者视频,找出文本的区域,可以是单字符位置或者整个文本行位置;\n",
......@@ -41,14 +41,14 @@
"1. 自然场景中文本具有多样性:文本检测受到文字颜色、大小、字体、形状、方向、语言、以及文本长度的影响;\n",
"2. 复杂的背景和干扰;文本检测受到图像失真,模糊,低分辨率,阴影,亮度等因素的影响;\n",
"3. 文本密集甚至重叠会影响文字的检测;\n",
"4. 文字存在局部一致性文本行的一小部分,也可视为是独立的文本\n",
"4. 文字存在局部一致性文本行的一小部分,也可视为是独立的文本\n",
"\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/072f208f2aff47e886cf2cf1378e23c648356686cf1349c799b42f662d8ced00\"\n",
"width=\"1000\" ></center>\n",
"\n",
"<br><center>图3 文本检测场景</center>\n",
"\n",
"针对以上问题,衍生了很多基于深度学习的文本检测算法,解决自然场景文字检测问题这些方法可以分为基于回归和基于分割的文本检测方法。\n",
"针对以上问题,衍生了很多基于深度学习的文本检测算法,用于解决自然场景文字检测问题这些方法可以分为基于回归和基于分割的文本检测方法。\n",
"\n",
"下一节将简要介绍基于深度学习技术的经典文字检测算法。"
]
......@@ -59,7 +59,7 @@
"collapsed": false
},
"source": [
"## 2. 文本检测方法介绍\n",
"## 2 文本检测方法介绍\n",
"\n",
"\n",
"近些年来基于深度学习的文本检测算法层出不穷,这些方法大致可以分为两类:\n",
......@@ -134,7 +134,7 @@
"\n",
"\n",
"\n",
"LOMO[19]针对长文本和弯曲文本问题,提出迭代的优化文本定位特征获取更精细的文本定位该方法包括三个部分坐标回归模块DR,迭代优化模块IRM以及任意形状表达模块SEM。分别用于生成文本大致区域,迭代优化文本定位特征,预测文本区域、文本中心线以及文本边界。迭代的优化文本特征可以更好的解决长文本定位问题以及获得更精确的文本区域定位。\n",
"LOMO[19]针对长文本和弯曲文本问题,提出迭代的优化文本定位特征获取更精细的文本定位该方法包括三个部分坐标回归模块DR,迭代优化模块IRM以及任意形状表达模块SEM。它们分别用于生成文本大致区域,迭代优化文本定位特征,预测文本区域、文本中心线以及文本边界。迭代的优化文本特征可以更好的解决长文本定位问题以及获得更精确的文本区域定位。\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/e90adf3ca25a45a0af0b84a181fbe2c4954be1fcca8f4049957128548b7131ef\"\n",
"width=\"1000\" ></center>\n",
"<br><center>图11 LOMO框架图</center>\n",
......@@ -228,7 +228,7 @@
"collapsed": false
},
"source": [
"## 3. 总结\n",
"## 3 总结\n",
"\n",
"本节介绍了近几年来文本检测领域的发展,包括基于回归、分割的文本检测方法,并分别列举并介绍了一些经典论文的方法思路。下一节以PaddleOCR开源库为例,详细介绍DBNet的算法原理以及核心代码实现。"
]
......
......@@ -13,13 +13,15 @@
"\n",
"通过本章的学习,你可以掌握:\n",
"\n",
"1. 如何使用paddleocr whl 包快速完成文本识别预测\n",
"1. 如何使用PaddleOCR whl包快速完成文本识别预测\n",
"\n",
"2. CRNN的基本原理和网络结构\n",
"\n",
"3. 模型训练的必须步骤和调参方式\n",
"\n",
"4. 使用自定义的数据集训练网络\n"
"4. 使用自定义的数据集训练网络\n",
"\n",
"注:`paddleocr`指代`PaddleOCR whl包`"
]
},
{
......@@ -189,7 +191,7 @@
"source": [
"# 安装 PaddlePaddle GPU 版本\n",
"!pip install paddlepaddle-gpu\n",
"# 安装 paddleocr whl包\n",
"# 安装 PaddleOCR whl包\n",
"! pip install -U pip\n",
"! pip install paddleocr"
]
......@@ -202,7 +204,7 @@
"source": [
"### 1.2 快速预测文字内容\n",
"\n",
"paddleocr whl包会自动下载ppocr轻量级模型作为默认模型\n",
"PaddleOCR whl包会自动下载ppocr轻量级模型作为默认模型\n",
"\n",
"下面展示如何使用whl包进行识别预测:\n",
"\n",
......@@ -326,13 +328,13 @@
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/f6fae3ff66bd413fa182d75782034a2af6aab1994fa148a08e6565f3fb75b18d width=\"600\"></center>\n",
"\n",
"1)backbone:\n",
"1. backbone:\n",
"\n",
"卷积网络作为底层的骨干网络,用于从输入图像中提取特征序列。由于 `conv`、`max-pooling`、`elementwise` 和激活函数都作用在局部区域上,所以它们是平移不变的。因此,特征映射的每一列对应于原始图像的一个矩形区域(称为感受野),并且这些矩形区域与它们在特征映射上对应的列从左到右的顺序相同。由于CNN需要将输入的图像缩放到固定的尺寸以满足其固定的输入维数,因此它不适合长度变化很大的序列对象。为了更好的支持变长序列,CRNN将backbone最后一层输出的特征向量送到了RNN层,转换为序列特征。\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/6694818123724b0d92d05b63dc9dfb08c7ced6c47c3b4f4d9b110ae9ccfe941d width=\"600\"></center>\n",
"\n",
"2)neck: \n",
"2. neck: \n",
"\n",
"递归层,在卷积网络的基础上,构建递归网络,将图像特征转换为序列特征,预测每个帧的标签分布。\n",
"RNN具有很强的捕获序列上下文信息的能力。使用上下文线索进行基于图像的序列识别比单独处理每个像素更有效。以场景文本识别为例,宽字符可能需要几个连续的帧来充分描述。此外,有些歧义字符在观察其上下文时更容易区分。其次,RNN可以将误差差分反向传播回卷积层,使网络可以统一训练。第三,RNN能够对任意长度的序列进行操作,解决了文本图片变长的问题。CRNN使用双层LSTM作为递归层,解决了长序列训练过程中的梯度消失和梯度爆炸问题。\n",
......@@ -340,7 +342,7 @@
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/41cdb7fb08fb4b55923b0baf66b783e46fd063223d05416fa952369ad20ac83c width=\"600\"></center>\n",
"\n",
"\n",
"3)head: \n",
"3. head: \n",
"\n",
"转录层,通过全连接网络和softmax激活函数,将每帧的预测转换为最终的标签序列。最后使用 CTC Loss 在无需序列对齐的情况下,完成CNN和RNN的联合训练。CTC 有一套特别的合并序列机制,LSTM输出序列后,需要在时序上分类得到预测结果。可能存在多个时间步对应同一个类别,因此需要对相同结果进行合并。为避免合并本身存在的重复字符,CTC 引入了一个 `blank` 字符插入在重复字符之间。\n",
"\n",
......@@ -429,7 +431,7 @@
},
{
"data": {
"image/png": "\n",
"image/png": "",
"text/plain": [
"<Figure size 432x288 with 2 Axes>"
]
......@@ -464,7 +466,7 @@
"\n",
"* backbone\n",
"\n",
"PaddleOCR 使用 MobileNetV3 作为骨干网络,组网顺序与网络结构一致首先定义网络中的公共模块([源码位置](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/ppocr/modeling/backbones/rec_mobilenet_v3.py)):ConvBNLayerResidualUnitmake_divisible"
"PaddleOCR 使用 MobileNetV3 作为骨干网络,组网顺序与网络结构一致首先定义网络中的公共模块([源码位置](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/ppocr/modeling/backbones/rec_mobilenet_v3.py)):`ConvBNLayer`、`ResidualUnit`、`make_divisible`。"
]
},
{
......@@ -647,7 +649,7 @@
"collapsed": false
},
"source": [
"利用公共模块搭建骨干网络"
"利用公共模块搭建骨干网络"
]
},
{
......@@ -990,7 +992,7 @@
"source": [
"* neck\n",
"\n",
"neck 部分将backbone输出的视觉特征图转换为1维向量输入送到 LSTM 网络中,输出序列特征( [源码位置](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/ppocr/modeling/necks/rnn.py) ):"
"neck 部分将backbone输出的视觉特征图转换为1维向量输入送到 LSTM 网络中,输出序列特征([源码位置](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/ppocr/modeling/necks/rnn.py)):"
]
},
{
......@@ -1354,7 +1356,7 @@
"source": [
"确认配置文件中的数据路径是否正确,以 [rec_icdar15_train.yml](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/configs/rec/rec_icdar15_train.yml)为例:\n",
"\n",
"```\n",
"```yaml\n",
"Train:\n",
" dataset:\n",
" name: SimpleDataSet\n",
......@@ -1460,7 +1462,7 @@
"outputs": [
{
"data": {
"image/png": "\n",
"image/png": "",
"text/plain": [
"<Figure size 432x288 with 2 Axes>"
]
......@@ -1572,12 +1574,11 @@
"collapsed": false
},
"source": [
"实现完单条数据返回逻辑后,调用 `padde.io.Dataloader` 即可把数据组合成batch,具体可参考 [build_dataloader]()\n",
"\n",
"实现完单条数据返回逻辑后,调用 `padde.io.Dataloader` 即可把数据组合成batch,具体可参考 [build_dataloader](https://github.com/PaddlePaddle/PaddleOCR/blob/95c670faf6cf4551c841764cde43a4f4d9d5e634/ppocr/data/__init__.py#L52)。\n",
"\n",
"* build model\n",
"\n",
" build model 即搭建主要网络结构,具体细节如《2.3 代码实现》所述,本节不做过多介绍,各模块代码可参考[modeling](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.3/ppocr/modeling)\n",
" build model 即搭建主要网络结构,具体细节如《2.3 代码实现》所述,本节不做过多介绍,各模块代码可参考[modeling](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.4/ppocr/modeling)\n",
"\n",
"* build loss\n",
" \n",
......
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -42,7 +42,7 @@
"\n",
"然后安装第三方库:\n",
"\n",
"```\n",
"```bash\n",
"cd PaddleOCR\n",
"pip3 install -r requirements.txt\n",
"```\n",
......
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![](https://ai-studio-static-online.cdn.bcebos.com/72b2077605dd49b78f7f647d6821d10231f6bc52d7ed463da451a6a0bd1fc5ff)\n",
"*Note: The above pictures are from the Internet*\n",
"\n",
"# 1. OCR Technical Background\n",
"## 1.1 Application Scenarios of OCR Technology\n",
"\n",
"* **<font color=red>What is OCR</font>**\n",
"\n",
"OCR (Optical Character Recognition) is one of the key directions in computer vision. The traditional definition of OCR is generally oriented to scanned document objects. Now we often say OCR generally refers to scene text recognition (Scene Text Recognition, STR), mainly for natural scenes, such as plaques and other visible texts in various natural scenes as shown in the figure below.\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/c87c0e6f6c0a42cdbc552a4f973c1b0217c369194c1243558753896f3e66032c)\n",
"<center>Figure 1: Document scene text recognition VS. Natural scene text recognition</center>\n",
"\n",
"<br>\n",
"\n",
"* **<font color=red>What are the application scenarios of OCR? </font>**\n",
"\n",
"OCR technology has a wealth of application scenarios. A typical scenario is vertically-oriented structured text recognition widely used in daily life, such as license plate recognition, bank card information recognition, ID card information recognition, train ticket information recognition, and so on. The common feature of these small verticals is that the format is fixed. Therefore, it is very suitable to use OCR technology for automation, greatly reducing labor costs and improving.\n",
"\n",
"This vertically-oriented structured text recognition is currently the most widely used and relatively mature technology scene in OCR.\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/56e0df91d0d34443aacb17c9a1c5c186608ee675092648a693503df7fe45e535)\n",
"<center>Figure 2: Application scenarios of OCR technology</center>\n",
"\n",
"In addition to vertically-oriented structured text recognition, general OCR technology also has a wide range of applications and is often combined with other technologies to complete multi-modal tasks. For example, in video scenes, OCR technology is often used for subtitle automatic translation, content security monitoring, etc., Or combined with visual features to complete tasks such as video understanding and video search.\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/ca2341a51eb242ee8e1afe121ce3ebbc87a113cef1b643ed9bba92d0c8ee4f0f)\n",
"<center>Figure 3: General OCR in a multi-modal scene</center>\n",
"\n",
"## 1.2 OCR Technical Challenge\n",
"The technical difficulties of OCR can be divided into two aspects: the algorithm layer and the application layer.\n",
"\n",
"* **<font color=red>Algorithm layer</font>**\n",
"\n",
"The rich application scenarios of OCR determine that it will have many technical difficulties. Here are 8 common problems:\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/a56831fbf0c449fe9156a893002cadfe110ccfea835b4d90854a7ce4b1df2a4f)\n",
"<center>Figure 4: Technical difficulties of OCR algorithm layer</center>\n",
"\n",
"These problems bring huge technical challenges to both text detection and text recognition. It can be seen that these challenges are mainly oriented to natural scenes. At present, research in academia mainly focuses on natural scenes, and the commonly used academic datasets in the OCR field are also natural scenes. There are many studies on these issues. Relatively speaking, identification is more challenging than detection.\n",
"\n",
"* **<font color=red>Application layer</font>**\n",
"\n",
"In practical applications, especially in a wide range of general scenarios, in addition to the technical difficulties at the algorithm level such as affine transformation, scale problems, insufficient lighting, and shooting blur summarized in the previous section, OCR technology also faces two major difficulties:\n",
"1. **Massive data requires OCR to be able to process in real time.** OCR applications are often connected to massive data. Real-time processing of the data is required or hoped for. Real-time model speed is a big challenge.\n",
"2. **The end-side application requires that the OCR model is light enough and the recognition speed is fast enough.** OCR applications are often deployed on mobile terminals or embedded hardware. There are generally two modes for terminal-side OCR applications: upload to server vs. terminal-side direct recognition. Considering that the method of uploading to the server has requirements on the network, the real-time performance is low, and the server pressure is high when the request volume is too large, as well as the security of data transmission, we hope to complete the OCR identification directly on the terminal side. However, the storage space and computing power of the terminal side are limited, so there are high requirements for the size and prediction speed of the OCR model.\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/5bafdc3da1614c41a95ae39a2c36632f95e2893031a64929b9f49d4a4985cd2d)\n",
"<center>Figure 5: Technical difficulties of OCR application layer</center>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 2. OCR Cutting-edge Algorithm\n",
"\n",
"Although OCR is a relatively specific task, it involves many aspects of technology, including text detection, text recognition, end-to-end text recognition, document analysis, and so on. Academic research on various related technologies of OCR emerges endlessly. The following will briefly introduce the related work of several key technologies in the OCR task.\n",
"\n",
"## 2.1 Text Detection\n",
"\n",
"The task of text detection is to locate text regions in the input image. In recent years, research on text detection in academia has been very rich. A class of methods regard text detection as a specific scene in target detection, and improve and adapt based on general target detection algorithms. For example, TextBoxes[1] is based on one-stage target detector SSD. The algorithm [2] adjusts the target frame to fit text lines with extreme aspect ratios, while CTPN [3] is improved based on the Faster RCNN [4] architecture. However, there are still some differences between text detection and target detection in the target information and the task itself. For example, the text is generally larger in length and width, often in the shape of \"stripes\", and the text lines may be denser, curved text, etc. Therefore, many algorithms dedicated to text detection have been derived, such as EAST[5], PSENet[6], DBNet[7] and so on.\n",
"\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/548b50212935402abb2e671c158c204737c2c64b9464442a8f65192c8a31b44d\" width=\"500\"></center>\n",
"<center>Figure 6: Example of text detection task</center>\n",
"\n",
"<br>\n",
"\n",
"At present, the more popular text detection algorithms can be roughly divided into two categories: **based on regression** and **based on segmentation**. There are also some algorithms that combine the two. Algorithms based on regression draw on general object detection algorithms, by setting the anchor regression detection frame, or directly doing pixel regression. This type of method has a better detection effect on regular-shaped text, but the detection effect on irregularly-shaped text will be relatively poor. For example, CTPN [3] has better detection effect on horizontal text, but poor detection effect on oblique and curved text. SegLink [8] is more effective for long text, but has limited effect on sparsely distributed text; algorithm based on segmentation Introduced Mask-RCNN [9], this type of algorithm can reach a higher level in various scenes and texts of various shapes, but the disadvantage is that the post-processing is generally more complicated, so there are often speed problems. And it cannot solve the problem of detecting overlapping text.\n",
"\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/4f4ea65578384900909efff93d0b7386e86ece144d8c4677b7bc94b4f0337cfb\" width=\"800\"></center>\n",
"<center>Figure 7: Overview of text detection algorithms</center>\n",
"\n",
"<br>\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/473ba28cd0274d568f90eb8ca9e78864d994f3ebffe6419cb638e193c607b7b3)|![](https://ai-studio-static-online.cdn.bceb8dc)|![](https://ai-studio-static-online.cdn.bcebos.com/53b9e85ce46645c08481d7d7377720f5eea5ac30e37e4e9c9930e1f26b02e278)\n",
"|---|---|---|\n",
"<center>Figure 8: (left) CTPN[3] algorithm optimization based on regression anchor (middle) DB[7] algorithm optimization post-processing based on segmentation (right) SAST[10] algorithm of regression + segmentation</center>\n",
"\n",
"<br>\n",
"\n",
"The related technology of text detection will be interpreted and actual combat in detail in Chapter 2.\n",
"\n",
"## 2.2 Text Recognition\n",
"\n",
"The task of text recognition is to recognize the text content in the image, and the input generally comes from the text area of the image cut out by the text box obtained by text detection. Text recognition can generally be divided into two categories: **Regular Text Recognition** and **Irregular Text Recognition** according to the shape of the text to be recognized. Regular text mainly refers to printed fonts, scanned text, etc., and the text is roughly in the horizontal line position. Irregular text is often not in a horizontal position, and has problems such as bending, occlusion, and blurring. Irregular text scenes are very challenging, and it is also the main research direction in the field of text recognition.\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/b292f21e50c94debab7496d4ced96a93774a8525c12346f49cb151bde2a58fe8)\n",
"<center>Figure 9: (Left) Regular text VS. (Right) Irregular text</center>\n",
"\n",
"<br>\n",
"\n",
"The algorithm of regular text recognition can be roughly divided into two types based on CTC and Sequence2Sequence according to the different decoding methods. The processing methods of converting the sequence features learned by the network into the final recognition result are different. The algorithm based on CTC is represented by the classic CRNN [11].\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/403ca85c59d344f88d3b1229ca14b1e90c5c73c9f1d248b7aa94103f9d0af597)\n",
"<center>Figure 10: CTC-based recognition algorithm VS. Attention-based recognition algorithm</center>\n",
"\n",
"The recognition algorithms for irregular texts are more abundant. Methods such as STAR-Net [12] correct the irregular texts into regular rectangles by adding correction modules such as TPS before recognition. Attention-based methods such as RARE [13] enhance the attention to the correlation of parts between sequences. The segmentation-based method treats each character of a text line as an independent individual, and it is easier to recognize a single segmented character than to recognize the entire text line after correction. In addition, with the rapid development of Transformer [14] and its effectiveness in various tasks in recent years, a number of Transformer-based text recognition algorithms have also appeared. These methods use the transformer structure to solve the long-dependency modeling of CNN. The limitations of the problem, but also achieved good results.\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/0fa30c3789424473ad9be1c87a4f742c1db69e3defb64651906e5334ed9571a8)\n",
"<center>Figure 11: Recognition algorithm based on character segmentation [15]</center>\n",
"\n",
"<br>\n",
"\n",
"The related technologies of text recognition will be interpreted and actual combat in detail in Chapter 3.\n",
"\n",
"## 2.3 Document Structure Recognition\n",
"\n",
"OCR technology in the traditional sense can solve the detection and recognition needs of text. However, in practical application scenarios, structured information is often needed in the end, such as information formatting and extraction of ID cards and invoices, structured identification of tables, and so on. The application scenarios of OCR technology are mostly express document extraction, contract content comparison, financial factoring document information comparison, and logistics document identification. OCR result + post-processing is a commonly used structuring scheme, but the process is often complicated, and post-processing requires fine design and poor generalization. Under the background of the gradual maturity of OCR technology and the growing demand for structured information extraction, various technologies related to intelligent document analysis, such as layout analysis, table recognition, and key information extraction, have received more and more attention and research.\n",
"\n",
"* **Layout Analysis**\n",
"\n",
"Layout Analysis is mainly used to classify the content of document images. The categories can generally be divided into plain text, titles, tables, pictures, etc. Existing methods generally regard different plates in the document as different targets for detection or segmentation. For example, Soto Carlos [16], based on the target detection algorithm Faster R-CNN, combines context information and uses the inherent position information of the document content to improve the performance. Region detection performance. Sarkar Mausoom et al.[17] proposed a priori-based segmentation mechanism to train a document segmentation model on very high-resolution images, solving the problem that different structures in dense regions cannot be distinguished and merged due to excessive reduction of the original image.\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/dedb212e8972497998685ff51af7bfe03fdea57f6acd450281ad100807086e1a)\n",
"<center>Figure 12: Schematic diagram of layout analysis tasks</center>\n",
"\n",
"<br>\n",
"\n",
"* **Table Recognition**\n",
"\n",
"The task of table recognition is to identify and convert the table information in the document into an excel file. The types and styles of tables in text images are complex and diverse, such as different row and column combinations, different content text types, etc. In addition, the style of the document and the lighting environment during shooting have brought great challenges to table recognition. These challenges make table recognition always a research difficulty in the field of document understanding.\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/47119a2a2f9a45788390d6506f90d5de7449738008aa4c0ab619b18f37bd8d57)\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/22ca5749441441e69dc0eaeb670832a5d0ae0ce522f34731be7d609a2d36e8c1)\n",
"<center>Figure 13: Schematic diagram of form recognition task</center>\n",
"\n",
"<br>\n",
"\n",
"There are many types of table recognition methods. The early traditional algorithms based on heuristic rules, such as the T-Rect algorithm proposed by Kieninger [18] and others, generally use manual design rules and connected domain detection and analysis. In recent years, with the development of deep learning, some CNN-based table structure recognition algorithms have emerged, such as DeepTabStR proposed by Siddiqui Shoaib Ahmed [19] and others, and TabStruct-Net proposed by Raja Sachin [20] and others. In addition, with the rise of *Graph Neural Network*, some researchers try to apply *Graph Neural Network* to the problem of table structure recognition. Based on the *Graph Neural Network*, table recognition is regarded as a graph reconstruction problem, such as Xue Wenyuan [21] TGRNet proposed by et al. The end-to-end method directly uses the network to complete the HTML representation output of the table structure. Most of the end-to-end methods use the Seq2Seq method to complete the prediction of the table structure, such as some methods based on Attention or Transformer, such as TableMaster [22].\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/a9a3c91898c84f03b382583859526c4b451ace862dbc4a15838f5dde4d0ea657)\n",
"<center>Figure 14: Schematic diagram of form identification method</center>\n",
"\n",
"<br>\n",
"\n",
"* **Key Information Extraction**\n",
"\n",
"Key Information Extraction (KIE) is an important task in Document VQA. It mainly extracts the key information needed from images, such as extracting name and citizen ID number information from ID cards. The types of such information are often It is fixed under a specific task, but is different between different tasks.\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/8af011647bb4464f80d07f3efeac469baed27c8185ef4c4883a19f40e8ba91f5)\n",
"<center>Figure 15: Schematic diagram of DocVQA tasks</center>\n",
"\n",
"<br>\n",
"\n",
"KIE is usually divided into two sub-tasks for research:\n",
"\n",
"- SER: Semantic Entity Recognition, to classify each detected text, such as dividing it into name and ID. As shown in the black box and red box in the figure below.\n",
"- RE: Relation Extraction, which classifies each detected text, such as dividing it into questions and answers. Then find the corresponding answer to each question. As shown in the figure below, the red and black boxes represent the question and the answer, respectively, and the yellow line represents the correspondence between the question and the answer.\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/2f1bc1a3e4a341ab9552bbf5f6c2be71ba78d7d65da64818b776efe0691e310b)\n",
"<center>Figure 16: ser and re tasks</center>\n",
"\n",
"<br>\n",
"\n",
"The general KIE method is researched based on Named Entity Recognition (NER) [4], but this type of method only uses the text information in the image and lacks the use of visual and structural information, so the accuracy is not high. On this basis, the methods in recent years have begun to merge visual and structural information with text information. According to the principles used when fusing multi-modal information, these methods can be divided into the following four types:\n",
"\n",
"- Grid-based method\n",
"- Token-based method\n",
"- GCN-based method\n",
"- Based on End to End method\n",
"\n",
"<br>\n",
"\n",
"Document analysis related technologies will be explained and actual combat in detail in Chapter 6.\n",
"\n",
"## 2.4 Other Related Technologies\n",
"\n",
"The previous mainly introduced three key technologies in the OCR field: text detection, text recognition, document structured recognition, and more other cutting-edge technologies related to OCR, including end-to-end text recognition, image preprocessing technology in OCR, and OCR data synthesis Etc., please refer to Chapter 7 and Chapter 8 of the tutorial.\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 3. Industrial Practice of OCR Technology\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/3d5f18f7598f405884fa2fab041c95ce415af40712e9489996747f9d122c3d90)\n",
"\n",
 You are Xiao Wang">
"> If you were Xiao Wang, what would you do?\n",
"> 1. I won't, I can't, I won't do it 😭\n",
"> 2. It is recommended that the boss find an outsourcing company or commercialization plan, anyway, spend the boss's money 😊\n",
 3. Find similar">
"> 3. Find similar projects online and do GitHub-oriented programming 😏\n",
"\n",
"<br>\n",
"\n",
"OCR technology will eventually fall into industrial practice. Although there is a lot of academic research on OCR technology, and the commercial application of OCR technology is relatively mature compared with other AI technologies, there are still some difficulties and challenges in actual industrial applications. The following will analyze from two perspectives of technology and industrial practice.\n",
"\n",
"\n",
"## 3.1 Difficulties in Industrial Practice\n",
"\n",
"In actual industrial practice, developers often need to rely on open source community resources to start or promote projects, and developers using open source models often face three major problems:\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/7e5e79240b9c4f13b675b56bc12edf540f159c922bf24e3cbc4a0635a356c7f9)\n",
"<center>Figure 17: Three major problems in the practice of OCR technology industry</center>\n",
"\n",
"**1. Can't find & can't choose**\n",
"\n",
"The open source community is rich in resources, but information asymmetry makes developers unable to solve pain points efficiently. On the one hand, the open source community resources are too rich. Faced with a requirement, developers cannot quickly find a project that matches the business requirement from the massive code repository, that is, there is a problem of \"can't find\". On the other hand, when selecting algorithms, the indicators on the English public dataset cannot provide a direct reference for the Chinese scenarios that developers often face. Algorithm-by-algorithm verification takes a lot of time and manpower, and there is no guarantee that the most suitable algorithm will be selected, that is, \"can't choose\".\n",
"\n",
"**2. Not applicable to industry scenarios**\n",
"\n",
"The work in the open source community tends to focus more on effect optimization, such as open source or reproduction of academic paper codes, and generally focus more on algorithm effects. Compared with the work that balances the size and speed of the model, it is much less, and the model size and prediction are time-consuming Two indicators that cannot be ignored in industrial practice are as important as the model effect. Whether it is on the mobile side or the server side, the number of images to be recognized is often very large, and it is hoped that the model will be smaller, more accurate, and faster in prediction. GPU is too expensive, it is better to use CPU to run more economically. On the premise of meeting business needs, the lighter the model, the less resources it takes.\n",
"\n",
"**3. Difficult optimization and many training deployment problems**\n",
"\n",
"The direct use of open source algorithms or models generally cannot directly meet business needs. In actual business scenarios, OCR faces a variety of problems. The personalization of business scenarios often requires retraining of custom data sets. On existing open source projects, various optimizations are experimented. The cost of the method is higher. In addition, OCR application scenarios are very rich. There are a wide range of application requirements on the server and various mobile devices. The diversification of the hardware environment needs to support rich deployment methods. The open source community’s projects focus more on algorithms and models, and predict deployment. This part is obviously under-supported. To apply OCR technology from the algorithm in the paper to the application of technology, it has high requirements for the algorithm and engineering ability of the developer.\n",
"\n",
"## 3.2 Industrial OCR Development Kit PaddleOCR\n",
"\n",
"OCR industry practice requires a complete set of full-process solutions to speed up the research and development progress and save valuable research and development time. In other words, the ultra-lightweight model and its full-process solutions, especially for mobile terminals and embedded devices with limited computing power and storage space, can be said to be a rigid demand.\n",
"\n",
"In this context, the industrial-grade OCR development kit [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR) came into being.\n",
"\n",
"The construction idea of ​​PaddleOCR starts from user portraits and needs, and relies on the core framework of flying oars, selects and reproduces a wealth of cutting-edge algorithms, and develops PP characteristic models that are more suitable for industrial landing based on recurring algorithms, and integrates training and promotion to provide A variety of predictive deployment methods to meet different demand scenarios of actual applications.\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/e09929b4a31e44f9b5e3d542d12411332669d2e1a21d45ad88b1dd91142ec86c)\n",
"<center>Figure 18: Panorama of PaddleOCR development kit</center>\n",
"\n",
"<br>\n",
"\n",
"It can be seen from the panorama that PaddleOCR relies on the core framework of the flying paddle, and provides a wealth of solutions in the model algorithm, pre-training model library, industrial-grade deployment, etc., and provides data synthesis and semi-automatic data annotation tools to meet the needs of development Data production needs of the author.\n",
"\n",
"**At the model algorithm level**, PaddleOCR provides solutions for the two tasks of **text detection and recognition** and **document structure analysis** respectively. In terms of text detection and recognition, PaddleOCR has reproduced or open sourced 4 text detection algorithms, 8 text recognition algorithms, and 1 end-to-end text recognition algorithm. On this basis, a general text detection and recognition solution of the PP-OCR series is developed. In terms of document structure analysis, PaddleOCR provides algorithms such as layout analysis, table recognition, key information extraction, and named entity recognition, and based on this, it proposes a PP-Structure document analysis solution. A rich selection of algorithms can meet the needs of developers in different business scenarios. The unification of the code framework also facilitates the optimization and performance comparison of different algorithms for developers.\n",
"\n",
"**At the level of pre-training model library**, based on PP-OCR and PP-Structure solutions, PaddleOCR has developed and open-sourced PP series characteristic models suitable for industrial practice, including general-purpose, ultra-lightweight and multi-language text detection and recognition Models, and complex document analysis models. The PP series characteristic models are deeply optimized on the original algorithm, so that they can reach the practical level of the industry in terms of effect and performance. Developers can either directly apply to business scenarios or use business data for simple finetune. Easily develop a \"practical model\" suitable for your business needs.\n",
"\n",
"**At the industrial level of deployment**, PaddleOCR provides a server-side prediction solution based on Paddle Inference, a service-based deployment solution based on Paddle Serving, and an end-side deployment solution based on Paddle-Lite to meet the deployment needs of different hardware environments , At the same time, it provides a model compression scheme based on PaddleSlim, which can further compress the model size. The above deployment methods have completed the whole process of training and pushing to ensure that developers can deploy efficiently, stably and reliably.\n",
"\n",
"**At the data tool level**, PaddleOCR provides a semi-automatic data annotation tool PPOCRLabel and a data synthesis tool Style-Text to help developers more conveniently produce the data sets and annotation information required for model training. PPOCRLabel, as the industry's first open source semi-automatic OCR data annotation tool, is aimed at the tedious and tedious process of labeling, high mechanicality, manual labeling of a large amount of training data, and expensive time and money. The built-in PP-OCR model realizes pre-labeling + manual verification. The labeling mode can greatly improve labeling efficiency and save labor costs. The data synthesis tool Style-Text mainly solves the serious shortage of real data in actual scenes. Traditional synthesis algorithms cannot synthesize text styles (fonts, colors, spacing, background). Only a few target scene images are needed to synthesize a large number of target scene styles in batches. Similar text images.\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/90a358d6a62c49b7b8db47e18c77878c60f80cf9c81541bfa3befea68d9dbc0f)\n",
"<center>Figure 19: Schematic diagram of PPOCRLabel usage</center>\n",
"<br>\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/b63b10bc530c42bea3d3b923da6000f1cfef006d7eec4ff3bdc0439bd9c333c9)\n",
"<center>Figure 20: Example of Style-Text synthesis effect</center>\n",
"\n",
"<br>\n",
"\n",
"### 3.2.1 PP-OCR and PP-Structrue\n",
"\n",
"The PP series characteristic model is a model that is deeply optimized for the practical needs of the industry by various visual development kits of the flying propeller, striving for a balance between speed and accuracy. The PP series featured models in PaddleOCR include PP-OCR series models for text detection and recognition tasks and PP-Structure series models for document analysis.\n",
"\n",
"**(1) PP-OCR Chinese and English model**\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/3372558042044d43983b815069e1e43cb84432b993ed400f946976e75bd51f38)\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/f0a0b936382c42dd8809e98759b4c84434d79386606b4d5b8a86416db6dbaeee)\n",
"<center>Figure 21: Example of PP-OCR model recognition results in Chinese and English</center>\n",
"\n",
"<br>\n",
"\n",
"The typical two-stage OCR algorithm adopted by the Chinese and English models of PP-OCR, that is, the composition method of detection model + recognition model, the specific algorithm framework is as follows:\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/8af1371b5e3c486bb90a041903200c7c666c8bbc98c245dc802ff8c4da98617e)\n",
"<center>Figure 22: Schematic diagram of PP-OCR system pipeline</center>\n",
"\n",
"<br>\n",
"\n",
"It can be seen that in addition to input and output, the core framework of PP-OCR contains 3 modules, namely: text detection module, detection frame correction module, and text recognition module.\n",
"- Text detection module: The core is a text detection model trained on the [DB](https://arxiv.org/abs/1911.08947) detection algorithm to detect the text area in the image;\n",
"- Detection frame correction module: Input the detected text box into the detection frame correction module. At this stage, the text box represented by the four points is corrected into a rectangular frame, which is convenient for subsequent text recognition. On the other hand, the text direction will be judged and corrected. For example, if the text line is judged to be upside down, it will be corrected. This function is realized by training a text direction classifier;\n",
"- Text recognition module: Finally, the text recognition module performs text recognition on the corrected detection box to obtain the text content in each text box. The classic text recognition algorithm used in PP-OCR [CRNN](https://arxiv.org/abs/1507.05717).\n",
"\n",
"PaddleOCR has successively introduced PP-OCR[23] and PP-OCRv2[24] models.\n",
"\n",
"PP-OCR model is divided into mobile version (lightweight version) and server version (universal version). The mobile version model is mainly optimized based on the lightweight backbone network MobileNetV3. The optimized model (detection model + text direction classification model + recognition model) ) The size is only 8.1M, the average single image prediction on the CPU takes 350ms, and the T4 GPU is about 110ms. After cropping and quantization, it can be further compressed to 3.5M without changing the accuracy, which is convenient for end-side deployment. The previous test predicts that it will only take 260ms. For more PP-OCR evaluation data, please refer to [benchmark](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.2/doc/doc_ch/benchmark.md).\n",
"\n",
"PP-OCRv2 maintains the overall framework of PP-OCR, mainly for further strategic optimization of effects. The improvement includes 3 aspects:\n",
"- Compared with the PP-OCR mobile version, the model effect is improved by over 7%;\n",
"- In terms of speed, compared to the PP-OCR server version, it has increased by more than 220%;\n",
"- In terms of model size, with a total size of 11.6M, both server and mobile terminals can be easily deployed.\n",
"\n",
"The specific optimization strategies of PP-OCR and PP-OCRv2 will be explained in detail in Chapter 4.\n",
"\n",
"In addition to the Chinese and English models, PaddleOCR also trained and open-sourced English digital models and multi-language recognition models based on different data sets. All of the above are ultra-lightweight models and are suitable for different language scenarios.\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/5978652a826647b98344cf61aa1c2027662af989b73e4a0e917d83718422eeb0)\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/1a8a8e24b5a440d388dae767adf0ea9c049335b04e964abbb176f58c5b028d7e)\n",
"<center>Figure 23: Schematic diagram of the recognition effect of the English digital model and multilingual model of PP-OCR</center>\n",
"\n",
"<br>\n",
"\n",
"**(2) PP-Structure document analysis model**\n",
"\n",
"PP-Structure supports three subtasks: layout analysis, table recognition, and DocVQA.\n",
"\n",
"The core functions of PP-Structure are as follows:\n",
"- Support layout analysis of documents in the form of pictures, which can be divided into 5 types of areas: text, title, table, picture and list (used in conjunction with Layout-Parser)\n",
"- Support text, title, picture and list area to be extracted as text fields (used in conjunction with PP-OCR)\n",
"- Supports structured analysis in the table area, and the final result is output to an Excel file\n",
"- Support Python whl package and command line two ways, simple and easy to use\n",
"- Support custom training for two types of tasks: layout analysis and table structuring\n",
"- Support VQA tasks-SER and RE\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/129708c265644dbc90d6c8f7db224b3a6f11f37bb586463a82e7ccb50bcc2e76)\n",
"<center>Figure 24: Schematic diagram of PP-Structure system (this figure only contains layout analysis + table identification)</center>\n",
"\n",
"<br>\n",
"\n",
"The specific plan of PP-Structure will be explained in detail in Chapter 6.\n",
"\n",
"### 3.2.2 Industrial-grade Deployment Plan\n",
"\n",
"The flying paddle supports full-process and full-scene inference deployment. There are three main sources of models. The first one uses PaddlePaddle API to build a network structure for training. The second is based on the flying paddle kit series. The flying paddle kit provides a wealth of models. Library, simple and easy-to-use API, with out-of-the-box use, including visual model library PaddleCV, intelligent speech library PaddleSpeech and natural language processing library PaddleNLP, etc. The third type uses X2Paddle tools from third-party frameworks (PyTorh, ONNX, TensorFlow, etc.) The output model.\n",
"\n",
"The paddle model can be compressed, quantified, and distilled using PaddleSlim tools. It supports five deployment schemes, namely, servicing Paddle Serving, server/cloud Paddle Inference, mobile/edge Paddle Lite, web front end Paddle.js, and for Paddle. Unsupported hardware, such as MCU, Horizon, Kunyun and other domestic chips, can be converted into a third-party framework that supports ONNX with the help of Paddle2ONNX.\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/c9ffe78e7db14e4eb103e7f393a16fbf2ab438540250474a8e0e7adc4aeb7ee0)\n",
"<center>Figure 25: Flying propeller support deployment method</center>\n",
"\n",
"<br>\n",
"\n",
"Paddle Inference supports server-side and cloud deployment, with high performance and versatility. It is deeply adapted and optimized for different platforms and different application scenarios. Paddle Inference is the native reasoning library for flying paddles, ensuring that the model can be trained on the server side. Use, rapid deployment, suitable for high-performance hardware using multiple application language environments to deploy models with complex algorithms. The hardware covers x86 CPUs, Nvidia GPUs, and AI accelerators such as Baidu Kunlun XPU and Huawei Shengteng.\n",
"\n",
"Paddle Lite is an end-side inference engine with lightweight and high-performance features. It has been configured and optimized in-depth for end-side devices and various application scenarios. Currently supports multiple platforms such as Android, IOS, embedded Linux devices, macOS, etc. The hardware covers ARM CPU and GPU, X86 CPU and new hardware such as Baidu Kunlun, Huawei Ascend and Kirin, Rockchip, etc.\n",
"\n",
"Paddle Serving is a high-performance service framework designed to help users quickly deploy models in cloud services in a few steps. At present, Paddle Serving supports functions such as custom pre-processing, model combination, model hot load update, multi-machine multi-card multi-model, distributed reasoning, K8S deployment, security gateway and model encryption deployment, and support for multi-language and multi-client access. Paddle Serving The official also provides deployment examples of more than 40 models, including PaddleOCR, to help users get started faster.\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/4d8063d74194434ea9b7c9f81c7fbdfd2131e13770124d2e99c1b9670f12e019)\n",
"<center>Figure 26: Support deployment mode of flying propeller</center>\n",
"\n",
"<br>\n",
"\n",
"The above deployment plan will be explained in detail and actual combat based on the PP-OCRv2 model in Chapter 5."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 4. Summary\n",
"\n",
"This section first introduces the application scenarios and cutting-edge algorithms of OCR technology, and then analyzes the difficulties and three major challenges of OCR technology in industrial practice.\n",
"\n",
"The contents of the subsequent chapters of this tutorial are arranged as follows:\n",
"\n",
"* The second and third chapters introduce detection and identification technology and practice respectively;\n",
"* Chapter 4 introduces PP-OCR optimization strategy;\n",
"* Chapter 5 Predicting and deploying actual combat;\n",
"* Chapter 6 introduces document structuring;\n",
"* Chapter 7 introduces other OCR-related algorithms such as end-to-end, data preprocessing, and data synthesis;\n",
"* Chapter 8 introduces OCR related data sets and data synthesis tools.\n",
"\n",
"# Reference\n",
"\n",
"[1] Liao, Minghui, et al. \"Textboxes: A fast text detector with a single deep neural network.\" Thirty-first AAAI conference on artificial intelligence. 2017.\n",
"\n",
"[2] Liu W, Anguelov D, Erhan D, et al. Ssd: Single shot multibox detector[C]//European conference on computer vision. Springer, Cham, 2016: 21-37.\n",
"\n",
"[3] Tian, Zhi, et al. \"Detecting text in natural image with connectionist text proposal network.\" European conference on computer vision. Springer, Cham, 2016.\n",
"\n",
"[4] Ren S, He K, Girshick R, et al. Faster r-cnn: Towards real-time object detection with region proposal networks[J]. Advances in neural information processing systems, 2015, 28: 91-99.\n",
"\n",
"[5] Zhou, Xinyu, et al. \"East: an efficient and accurate scene text detector.\" Proceedings of the IEEE conference on Computer Vision and Pattern Recognition. 2017.\n",
"\n",
"[6] Wang, Wenhai, et al. \"Shape robust text detection with progressive scale expansion network.\" Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. 2019.\n",
"\n",
"[7] Liao, Minghui, et al. \"Real-time scene text detection with differentiable binarization.\" Proceedings of the AAAI Conference on Artificial Intelligence. Vol. 34. No. 07. 2020.\n",
"\n",
"[8] Deng, Dan, et al. \"Pixellink: Detecting scene text via instance segmentation.\" Proceedings of the AAAI Conference on Artificial Intelligence. Vol. 32. No. 1. 2018.\n",
"\n",
"[9] He K, Gkioxari G, Dollár P, et al. Mask r-cnn[C]//Proceedings of the IEEE international conference on computer vision. 2017: 2961-2969.\n",
"\n",
"[10] Wang P, Zhang C, Qi F, et al. A single-shot arbitrarily-shaped text detector based on context attended multi-task \n",
"learning[C]//Proceedings of the 27th ACM international conference on multimedia. 2019: 1277-1285.\n",
"\n",
"[11] Shi, B., Bai, X., & Yao, C. (2016). An end-to-end trainable neural network for image-based sequence recognition and its application to scene text recognition. IEEE transactions on pattern analysis and machine intelligence, 39(11), 2298-2304.\n",
"\n",
"[12] Star-Net Max Jaderberg, Karen Simonyan, Andrew Zisserman, et al. Spa- tial transformer networks. In Advances in neural information processing systems, pages 2017–2025, 2015.\n",
"\n",
"[13] Shi, B., Wang, X., Lyu, P., Yao, C., & Bai, X. (2016). Robust scene text recognition with automatic rectification. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 4168-4176).\n",
"\n",
"[14] Sheng, F., Chen, Z., & Xu, B. (2019, September). NRTR: A no-recurrence sequence-to-sequence model for scene text recognition. In 2019 International Conference on Document Analysis and Recognition (ICDAR) (pp. 781-786). IEEE.\n",
"\n",
"[15] Lyu P, Liao M, Yao C, et al. Mask textspotter: An end-to-end trainable neural network for spotting text with arbitrary shapes[C]//Proceedings of the European Conference on Computer Vision (ECCV). 2018: 67-83.\n",
"\n",
"[16] Soto C, Yoo S. Visual detection with context for document layout analysis[C]//Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP). 2019: 3464-3470.\n",
"\n",
"[17] Sarkar M, Aggarwal M, Jain A, et al. Document Structure Extraction using Prior based High Resolution Hierarchical Semantic Segmentation[C]//European Conference on Computer Vision. Springer, Cham, 2020: 649-666.\n",
"\n",
"[18] Kieninger T, Dengel A. A paper-to-HTML table converting system[C]//Proceedings of document analysis systems (DAS). 1998, 98: 356-365.\n",
"\n",
"[19] Siddiqui S A, Fateh I A, Rizvi S T R, et al. Deeptabstr: Deep learning based table structure recognition[C]//2019 International Conference on Document Analysis and Recognition (ICDAR). IEEE, 2019: 1403-1409.\n",
"\n",
"[20] Raja S, Mondal A, Jawahar C V. Table structure recognition using top-down and bottom-up cues[C]//European Conference on Computer Vision. Springer, Cham, 2020: 70-86.\n",
"\n",
"[21] Xue W, Yu B, Wang W, et al. TGRNet: A Table Graph Reconstruction Network for Table Structure Recognition[J]. arXiv preprint arXiv:2106.10598, 2021.\n",
"\n",
"[22] Ye J, Qi X, He Y, et al. PingAn-VCGroup's Solution for ICDAR 2021 Competition on Scientific Literature Parsing Task B: Table Recognition to HTML[J]. arXiv preprint arXiv:2105.01848, 2021.\n",
"\n",
"[23] Du Y, Li C, Guo R, et al. PP-OCR: A practical ultra lightweight OCR system[J]. arXiv preprint arXiv:2009.09941, 2020.\n",
"\n",
"[24] Du Y, Li C, Guo R, et al. PP-OCRv2: Bag of Tricks for Ultra Lightweight OCR System[J]. arXiv preprint arXiv:2109.03144, 2021.\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "py35-paddle1.2.0"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment