diff --git a/magic_pdf/__init__.py b/magic_pdf/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/magic_pdf/config/__init__.py b/magic_pdf/config/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/magic_pdf/config/constants.py b/magic_pdf/config/constants.py
deleted file mode 100644
index b18d630bbd1144816c4e0102c7f5ad53bc6d6194..0000000000000000000000000000000000000000
--- a/magic_pdf/config/constants.py
+++ /dev/null
@@ -1,60 +0,0 @@
-"""Custom fields at the span level."""
-# whether the span was merged across pages
-CROSS_PAGE = 'cross_page'
-
-"""
-Custom fields at the block level
-"""
-# whether lines in the block were deleted
-LINES_DELETED = 'lines_deleted'
-
-# table recognition max time default value
-TABLE_MAX_TIME_VALUE = 400
-
-# pp_table_result_max_length
-TABLE_MAX_LEN = 480
-
-# table master structure dict
-TABLE_MASTER_DICT = 'table_master_structure_dict.txt'
-
-# table master dir
-TABLE_MASTER_DIR = 'table_structure_tablemaster_infer/'
-
-# pp detect model dir
-DETECT_MODEL_DIR = 'ch_PP-OCRv4_det_infer'
-
-# pp rec model dir
-REC_MODEL_DIR = 'ch_PP-OCRv4_rec_infer'
-
-# pp rec char dict path
-REC_CHAR_DICT = 'ppocr_keys_v1.txt'
-
-# pp rec copy rec directory
-PP_REC_DIRECTORY = '.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer'
-
-# pp rec copy det directory
-PP_DET_DIRECTORY = '.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer'
-
-
-class MODEL_NAME:
-    # pp table structure algorithm
-    TABLE_MASTER = 'tablemaster'
-    # struct eqtable
-    STRUCT_EQTABLE = 'struct_eqtable'
-
-    DocLayout_YOLO = 'doclayout_yolo'
-
-    LAYOUTLMv3 = 'layoutlmv3'
-
-    YOLO_V8_MFD = 'yolo_v8_mfd'
-
-    UniMerNet_v2_Small = 'unimernet_small'
-
-    RAPID_TABLE = 'rapid_table'
-
-    YOLO_V11_LangDetect = 'yolo_v11n_langdetect'
-
-
-PARSE_TYPE_TXT = 'txt'
-PARSE_TYPE_OCR = 'ocr'
-
diff --git a/magic_pdf/config/drop_reason.py b/magic_pdf/config/drop_reason.py
deleted file mode 100644
index d75d5676b81481c987f6c4d4948aaa82e9a4c86f..0000000000000000000000000000000000000000
--- a/magic_pdf/config/drop_reason.py
+++ /dev/null
@@ -1,35 +0,0 @@
-class DropReason:
-    TEXT_BLCOK_HOR_OVERLAP = 'text_block_horizontal_overlap'  # text blocks overlap horizontally, so the reading order cannot be determined reliably
-    USEFUL_BLOCK_HOR_OVERLAP = (
-        'useful_block_horizontal_overlap'  # blocks that must be kept overlap horizontally
-    )
-    COMPLICATED_LAYOUT = 'complicated_layout'  # complicated layout, not supported yet
-    TOO_MANY_LAYOUT_COLUMNS = 'too_many_layout_columns'  # layouts with more than 2 columns are not supported yet
-    COLOR_BACKGROUND_TEXT_BOX = 'color_background_text_box'  # the PDF contains colored background blocks that change the reading order; PDFs with colored-background text blocks are not supported yet
-    HIGH_COMPUTATIONAL_lOAD_BY_IMGS = (
-        'high_computational_load_by_imgs'  # contains special images that are too costly to process, so the page is dropped
-    )
-    HIGH_COMPUTATIONAL_lOAD_BY_SVGS = (
-        'high_computational_load_by_svgs'  # special SVG images that are too costly to process, so the page is dropped
-    )
-    HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = 'high_computational_load_by_total_pages'  # total computation exceeds what the current method can handle
-    MISS_DOC_LAYOUT_RESULT = 'missing doc_layout_result'  # layout analysis failed
-    Exception = '_exception'  # an exception occurred during parsing
-    ENCRYPTED = 'encrypted'  # the PDF is encrypted
-    EMPTY_PDF = 'total_page=0'  # the PDF has zero pages
-    NOT_IS_TEXT_PDF = 'not_is_text_pdf'  # not a text-based PDF, cannot be parsed directly
-    DENSE_SINGLE_LINE_BLOCK = 'dense_single_line_block'  # paragraphs cannot be segmented cleanly
-    TITLE_DETECTION_FAILED = 'title_detection_failed'  # title detection failed
-    TITLE_LEVEL_FAILED = (
-        'title_level_failed'  # failed to determine title levels (e.g. level-1, level-2, level-3 titles)
-    )
-    PARA_SPLIT_FAILED = 'para_split_failed'  # paragraph splitting failed
-    PARA_MERGE_FAILED = 'para_merge_failed'  # paragraph merging failed
-    NOT_ALLOW_LANGUAGE = 'not_allow_language'  # unsupported language
-    SPECIAL_PDF = 'special_pdf'
-    PSEUDO_SINGLE_COLUMN = 'pseudo_single_column'  # text columns cannot be determined precisely
-    CAN_NOT_DETECT_PAGE_LAYOUT = 'can_not_detect_page_layout'  # the page layout cannot be analyzed
-    NEGATIVE_BBOX_AREA = 'negative_bbox_area'  # scaling produced a negative bbox area
-    OVERLAP_BLOCKS_CAN_NOT_SEPARATION = (
-        'overlap_blocks_can_t_separation'  # overlapping blocks cannot be separated
-    )
diff --git a/magic_pdf/config/drop_tag.py b/magic_pdf/config/drop_tag.py
deleted file mode 100644
index 51a2bc99378ddb1182a3c87de4e3623f00f93807..0000000000000000000000000000000000000000
--- a/magic_pdf/config/drop_tag.py
+++ /dev/null
@@ -1,19 +0,0 @@
-
-COLOR_BG_HEADER_TXT_BLOCK = 'color_background_header_txt_block'
-PAGE_NO = 'page-no'  # page number
-CONTENT_IN_FOOT_OR_HEADER = 'in-foot-header-area'  # text inside the header or footer area
-VERTICAL_TEXT = 'vertical-text'  # vertical text
-ROTATE_TEXT = 'rotate-text'  # rotated text
-EMPTY_SIDE_BLOCK = 'empty-side-block'  # empty block at the page margin with no content
-ON_IMAGE_TEXT = 'on-image-text'  # text on top of an image
-ON_TABLE_TEXT = 'on-table-text'  # text on top of a table
-
-
-class DropTag:
-    PAGE_NUMBER = 'page_no'
-    HEADER = 'header'
-    FOOTER = 'footer'
-    FOOTNOTE = 'footnote'
-    NOT_IN_LAYOUT = 'not_in_layout'
-    SPAN_OVERLAP = 'span_overlap'
-    BLOCK_OVERLAP = 'block_overlap'
diff --git a/magic_pdf/config/enums.py b/magic_pdf/config/enums.py
deleted file mode 100644
index 6f3e91a3227e6cb6678af0fc578a833a3d2439e3..0000000000000000000000000000000000000000
--- a/magic_pdf/config/enums.py
+++ /dev/null
@@ -1,7 +0,0 @@
-
-import enum
-
-
-class SupportedPdfParseMethod(enum.Enum):
-    OCR = 'ocr'
-    TXT = 'txt'
diff --git a/magic_pdf/config/exceptions.py b/magic_pdf/config/exceptions.py
deleted file mode 100644
index c0b7beda3409df0daaf63aac254f337186bc2999..0000000000000000000000000000000000000000
--- a/magic_pdf/config/exceptions.py
+++ /dev/null
@@ -1,39 +0,0 @@
-
-class FileNotExisted(Exception):
-
-    def __init__(self, path):
-        self.path = path
-
-    def __str__(self):
-        return f'File {self.path} does not exist.'
- - -class InvalidConfig(Exception): - def __init__(self, msg): - self.msg = msg - - def __str__(self): - return f'Invalid config: {self.msg}' - - -class InvalidParams(Exception): - def __init__(self, msg): - self.msg = msg - - def __str__(self): - return f'Invalid params: {self.msg}' - - -class EmptyData(Exception): - def __init__(self, msg): - self.msg = msg - - def __str__(self): - return f'Empty data: {self.msg}' - -class CUDA_NOT_AVAILABLE(Exception): - def __init__(self, msg): - self.msg = msg - - def __str__(self): - return f'CUDA not available: {self.msg}' \ No newline at end of file diff --git a/magic_pdf/config/make_content_config.py b/magic_pdf/config/make_content_config.py deleted file mode 100644 index abcd74a4b860f163deb484ad33797c638034fb08..0000000000000000000000000000000000000000 --- a/magic_pdf/config/make_content_config.py +++ /dev/null @@ -1,11 +0,0 @@ -class MakeMode: - MM_MD = 'mm_markdown' - NLP_MD = 'nlp_markdown' - STANDARD_FORMAT = 'standard_format' - - -class DropMode: - WHOLE_PDF = 'whole_pdf' - SINGLE_PAGE = 'single_page' - NONE = 'none' - NONE_WITH_REASON = 'none_with_reason' diff --git a/magic_pdf/config/model_block_type.py b/magic_pdf/config/model_block_type.py deleted file mode 100644 index 4ad739ac51c08071626d8badd17f43b0eb90a66c..0000000000000000000000000000000000000000 --- a/magic_pdf/config/model_block_type.py +++ /dev/null @@ -1,10 +0,0 @@ -from enum import Enum - - -class ModelBlockTypeEnum(Enum): - TITLE = 0 - PLAIN_TEXT = 1 - ABANDON = 2 - ISOLATE_FORMULA = 8 - EMBEDDING = 13 - ISOLATED = 14 diff --git a/magic_pdf/config/ocr_content_type.py b/magic_pdf/config/ocr_content_type.py deleted file mode 100644 index 30d88cfdedbf28d3552a92e1549b839bea195f5b..0000000000000000000000000000000000000000 --- a/magic_pdf/config/ocr_content_type.py +++ /dev/null @@ -1,40 +0,0 @@ -class ContentType: - Image = 'image' - Table = 'table' - Text = 'text' - InlineEquation = 'inline_equation' - InterlineEquation = 'interline_equation' - - -class BlockType: - Image = 'image' - ImageBody = 'image_body' - ImageCaption = 'image_caption' - ImageFootnote = 'image_footnote' - Table = 'table' - TableBody = 'table_body' - TableCaption = 'table_caption' - TableFootnote = 'table_footnote' - Text = 'text' - Title = 'title' - InterlineEquation = 'interline_equation' - Footnote = 'footnote' - Discarded = 'discarded' - List = 'list' - Index = 'index' - - -class CategoryId: - Title = 0 - Text = 1 - Abandon = 2 - ImageBody = 3 - ImageCaption = 4 - TableBody = 5 - TableCaption = 6 - TableFootnote = 7 - InterlineEquation_Layout = 8 - InlineEquation = 13 - InterlineEquation_YOLO = 14 - OcrText = 15 - ImageFootnote = 101 diff --git a/magic_pdf/data/__init__.py b/magic_pdf/data/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/magic_pdf/data/batch_build_dataset.py b/magic_pdf/data/batch_build_dataset.py deleted file mode 100644 index 52d33485f1c9a624b31e16029526f0ac653a165f..0000000000000000000000000000000000000000 --- a/magic_pdf/data/batch_build_dataset.py +++ /dev/null @@ -1,167 +0,0 @@ -import concurrent.futures - -import fitz - -from magic_pdf.data.dataset import PymuDocDataset -from magic_pdf.data.utils import fitz_doc_to_image # PyMuPDF - - -def partition_array_greedy(arr, k): - """Partition an array into k parts using a simple greedy approach. 
- - Parameters: - ----------- - arr : list - The input array of integers - k : int - Number of partitions to create - - Returns: - -------- - partitions : list of lists - The k partitions of the array - """ - # Handle edge cases - if k <= 0: - raise ValueError('k must be a positive integer') - if k > len(arr): - k = len(arr) # Adjust k if it's too large - if k == 1: - return [list(range(len(arr)))] - if k == len(arr): - return [[i] for i in range(len(arr))] - - # Sort the array in descending order - sorted_indices = sorted(range(len(arr)), key=lambda i: arr[i][1], reverse=True) - - # Initialize k empty partitions - partitions = [[] for _ in range(k)] - partition_sums = [0] * k - - # Assign each element to the partition with the smallest current sum - for idx in sorted_indices: - # Find the partition with the smallest sum - min_sum_idx = partition_sums.index(min(partition_sums)) - - # Add the element to this partition - partitions[min_sum_idx].append(idx) # Store the original index - partition_sums[min_sum_idx] += arr[idx][1] - - return partitions - - -def process_pdf_batch(pdf_jobs, idx): - """Process a batch of PDF pages using multiple threads. - - Parameters: - ----------- - pdf_jobs : list of tuples - List of (pdf_path, page_num) tuples - output_dir : str or None - Directory to save images to - num_threads : int - Number of threads to use - **kwargs : - Additional arguments for process_pdf_page - - Returns: - -------- - images : list - List of processed images - """ - images = [] - - for pdf_path, _ in pdf_jobs: - doc = fitz.open(pdf_path) - tmp = [] - for page_num in range(len(doc)): - page = doc[page_num] - tmp.append(fitz_doc_to_image(page)) - images.append(tmp) - return (idx, images) - - -def batch_build_dataset(pdf_paths, k, lang=None): - """Process multiple PDFs by partitioning them into k balanced parts and - processing each part in parallel. 
- - Parameters: - ----------- - pdf_paths : list - List of paths to PDF files - k : int - Number of partitions to create - output_dir : str or None - Directory to save images to - threads_per_worker : int - Number of threads to use per worker - **kwargs : - Additional arguments for process_pdf_page - - Returns: - -------- - all_images : list - List of all processed images - """ - - results = [] - for pdf_path in pdf_paths: - with open(pdf_path, 'rb') as f: - pdf_bytes = f.read() - dataset = PymuDocDataset(pdf_bytes, lang=lang) - results.append(dataset) - return results - - - # - # # Get page counts for each PDF - # pdf_info = [] - # total_pages = 0 - # - # for pdf_path in pdf_paths: - # try: - # doc = fitz.open(pdf_path) - # num_pages = len(doc) - # pdf_info.append((pdf_path, num_pages)) - # total_pages += num_pages - # doc.close() - # except Exception as e: - # print(f'Error opening {pdf_path}: {e}') - # - # # Partition the jobs based on page countEach job has 1 page - # partitions = partition_array_greedy(pdf_info, k) - # - # # Process each partition in parallel - # all_images_h = {} - # - # with concurrent.futures.ProcessPoolExecutor(max_workers=k) as executor: - # # Submit one task per partition - # futures = [] - # for sn, partition in enumerate(partitions): - # # Get the jobs for this partition - # partition_jobs = [pdf_info[idx] for idx in partition] - # - # # Submit the task - # future = executor.submit( - # process_pdf_batch, - # partition_jobs, - # sn - # ) - # futures.append(future) - # # Process results as they complete - # for i, future in enumerate(concurrent.futures.as_completed(futures)): - # try: - # idx, images = future.result() - # all_images_h[idx] = images - # except Exception as e: - # print(f'Error processing partition: {e}') - # results = [None] * len(pdf_paths) - # for i in range(len(partitions)): - # partition = partitions[i] - # for j in range(len(partition)): - # with open(pdf_info[partition[j]][0], 'rb') as f: - # pdf_bytes = f.read() - # dataset = PymuDocDataset(pdf_bytes, lang=lang) - # dataset.set_images(all_images_h[i][j]) - # results[partition[j]] = dataset - # return results \ No newline at end of file diff --git a/magic_pdf/data/data_reader_writer/__init__.py b/magic_pdf/data/data_reader_writer/__init__.py deleted file mode 100644 index f8f8234739e4cc756b56dbd4cb502893481a7a09..0000000000000000000000000000000000000000 --- a/magic_pdf/data/data_reader_writer/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -from magic_pdf.data.data_reader_writer.filebase import \ - FileBasedDataReader # noqa: F401 -from magic_pdf.data.data_reader_writer.filebase import \ - FileBasedDataWriter # noqa: F401 -from magic_pdf.data.data_reader_writer.multi_bucket_s3 import \ - MultiBucketS3DataReader # noqa: F401 -from magic_pdf.data.data_reader_writer.multi_bucket_s3 import \ - MultiBucketS3DataWriter # noqa: F401 -from magic_pdf.data.data_reader_writer.s3 import S3DataReader # noqa: F401 -from magic_pdf.data.data_reader_writer.s3 import S3DataWriter # noqa: F401 -from magic_pdf.data.data_reader_writer.base import DataReader # noqa: F401 -from magic_pdf.data.data_reader_writer.base import DataWriter # noqa: F401 \ No newline at end of file diff --git a/magic_pdf/data/data_reader_writer/base.py b/magic_pdf/data/data_reader_writer/base.py deleted file mode 100644 index d294b329559723303b1f42cb9f48c39f07ae3622..0000000000000000000000000000000000000000 --- a/magic_pdf/data/data_reader_writer/base.py +++ /dev/null @@ -1,63 +0,0 @@ - -from abc import ABC, abstractmethod - - -class 
DataReader(ABC): - - def read(self, path: str) -> bytes: - """Read the file. - - Args: - path (str): file path to read - - Returns: - bytes: the content of the file - """ - return self.read_at(path) - - @abstractmethod - def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes: - """Read the file at offset and limit. - - Args: - path (str): the file path - offset (int, optional): the number of bytes skipped. Defaults to 0. - limit (int, optional): the length of bytes want to read. Defaults to -1. - - Returns: - bytes: the content of the file - """ - pass - - -class DataWriter(ABC): - @abstractmethod - def write(self, path: str, data: bytes) -> None: - """Write the data to the file. - - Args: - path (str): the target file where to write - data (bytes): the data want to write - """ - pass - - def write_string(self, path: str, data: str) -> None: - """Write the data to file, the data will be encoded to bytes. - - Args: - path (str): the target file where to write - data (str): the data want to write - """ - - def safe_encode(data: str, method: str): - try: - bit_data = data.encode(encoding=method, errors='replace') - return bit_data, True - except: # noqa - return None, False - - for method in ['utf-8', 'ascii']: - bit_data, flag = safe_encode(data, method) - if flag: - self.write(path, bit_data) - break diff --git a/magic_pdf/data/data_reader_writer/filebase.py b/magic_pdf/data/data_reader_writer/filebase.py deleted file mode 100644 index ff098ea0826e207663a6f51a21a4214d951f91b4..0000000000000000000000000000000000000000 --- a/magic_pdf/data/data_reader_writer/filebase.py +++ /dev/null @@ -1,62 +0,0 @@ -import os - -from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter - - -class FileBasedDataReader(DataReader): - def __init__(self, parent_dir: str = ''): - """Initialized with parent_dir. - - Args: - parent_dir (str, optional): the parent directory that may be used within methods. Defaults to ''. - """ - self._parent_dir = parent_dir - - def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes: - """Read at offset and limit. - - Args: - path (str): the path of file, if the path is relative path, it will be joined with parent_dir. - offset (int, optional): the number of bytes skipped. Defaults to 0. - limit (int, optional): the length of bytes want to read. Defaults to -1. - - Returns: - bytes: the content of file - """ - fn_path = path - if not os.path.isabs(fn_path) and len(self._parent_dir) > 0: - fn_path = os.path.join(self._parent_dir, path) - - with open(fn_path, 'rb') as f: - f.seek(offset) - if limit == -1: - return f.read() - else: - return f.read(limit) - - -class FileBasedDataWriter(DataWriter): - def __init__(self, parent_dir: str = '') -> None: - """Initialized with parent_dir. - - Args: - parent_dir (str, optional): the parent directory that may be used within methods. Defaults to ''. - """ - self._parent_dir = parent_dir - - def write(self, path: str, data: bytes) -> None: - """Write file with data. - - Args: - path (str): the path of file, if the path is relative path, it will be joined with parent_dir. 
- data (bytes): the data want to write - """ - fn_path = path - if not os.path.isabs(fn_path) and len(self._parent_dir) > 0: - fn_path = os.path.join(self._parent_dir, path) - - if not os.path.exists(os.path.dirname(fn_path)) and os.path.dirname(fn_path) != "": - os.makedirs(os.path.dirname(fn_path), exist_ok=True) - - with open(fn_path, 'wb') as f: - f.write(data) diff --git a/magic_pdf/data/data_reader_writer/multi_bucket_s3.py b/magic_pdf/data/data_reader_writer/multi_bucket_s3.py deleted file mode 100644 index 525209f07db93a4dd0b8b4e17bd2a5ba2453c605..0000000000000000000000000000000000000000 --- a/magic_pdf/data/data_reader_writer/multi_bucket_s3.py +++ /dev/null @@ -1,145 +0,0 @@ - -from magic_pdf.config.exceptions import InvalidConfig, InvalidParams -from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter -from magic_pdf.data.io.s3 import S3Reader, S3Writer -from magic_pdf.data.schemas import S3Config -from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path, - remove_non_official_s3_args) - - -class MultiS3Mixin: - def __init__(self, default_prefix: str, s3_configs: list[S3Config]): - """Initialized with multiple s3 configs. - - Args: - default_prefix (str): the default prefix of the relative path. for example, {some_bucket}/{some_prefix} or {some_bucket} - s3_configs (list[S3Config]): list of s3 configs, the bucket_name must be unique in the list. - - Raises: - InvalidConfig: default bucket config not in s3_configs. - InvalidConfig: bucket name not unique in s3_configs. - InvalidConfig: default bucket must be provided. - """ - if len(default_prefix) == 0: - raise InvalidConfig('default_prefix must be provided') - - arr = default_prefix.strip('/').split('/') - self.default_bucket = arr[0] - self.default_prefix = '/'.join(arr[1:]) - - found_default_bucket_config = False - for conf in s3_configs: - if conf.bucket_name == self.default_bucket: - found_default_bucket_config = True - break - - if not found_default_bucket_config: - raise InvalidConfig( - f'default_bucket: {self.default_bucket} config must be provided in s3_configs: {s3_configs}' - ) - - uniq_bucket = set([conf.bucket_name for conf in s3_configs]) - if len(uniq_bucket) != len(s3_configs): - raise InvalidConfig( - f'the bucket_name in s3_configs: {s3_configs} must be unique' - ) - - self.s3_configs = s3_configs - self._s3_clients_h: dict = {} - - -class MultiBucketS3DataReader(DataReader, MultiS3Mixin): - def read(self, path: str) -> bytes: - """Read the path from s3, select diffect bucket client for each request - based on the bucket, also support range read. - - Args: - path (str): the s3 path of file, the path must be in the format of s3://bucket_name/path?offset,limit. - for example: s3://bucket_name/path?0,100. - - Returns: - bytes: the content of s3 file. 
- """ - may_range_params = parse_s3_range_params(path) - if may_range_params is None or 2 != len(may_range_params): - byte_start, byte_len = 0, -1 - else: - byte_start, byte_len = int(may_range_params[0]), int(may_range_params[1]) - path = remove_non_official_s3_args(path) - return self.read_at(path, byte_start, byte_len) - - def __get_s3_client(self, bucket_name: str): - if bucket_name not in set([conf.bucket_name for conf in self.s3_configs]): - raise InvalidParams( - f'bucket name: {bucket_name} not found in s3_configs: {self.s3_configs}' - ) - if bucket_name not in self._s3_clients_h: - conf = next( - filter(lambda conf: conf.bucket_name == bucket_name, self.s3_configs) - ) - self._s3_clients_h[bucket_name] = S3Reader( - bucket_name, - conf.access_key, - conf.secret_key, - conf.endpoint_url, - conf.addressing_style, - ) - return self._s3_clients_h[bucket_name] - - def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes: - """Read the file with offset and limit, select diffect bucket client - for each request based on the bucket. - - Args: - path (str): the file path. - offset (int, optional): the number of bytes skipped. Defaults to 0. - limit (int, optional): the number of bytes want to read. Defaults to -1 which means infinite. - - Returns: - bytes: the file content. - """ - if path.startswith('s3://'): - bucket_name, path = parse_s3path(path) - s3_reader = self.__get_s3_client(bucket_name) - else: - s3_reader = self.__get_s3_client(self.default_bucket) - if self.default_prefix: - path = self.default_prefix + '/' + path - return s3_reader.read_at(path, offset, limit) - - -class MultiBucketS3DataWriter(DataWriter, MultiS3Mixin): - def __get_s3_client(self, bucket_name: str): - if bucket_name not in set([conf.bucket_name for conf in self.s3_configs]): - raise InvalidParams( - f'bucket name: {bucket_name} not found in s3_configs: {self.s3_configs}' - ) - if bucket_name not in self._s3_clients_h: - conf = next( - filter(lambda conf: conf.bucket_name == bucket_name, self.s3_configs) - ) - self._s3_clients_h[bucket_name] = S3Writer( - bucket_name, - conf.access_key, - conf.secret_key, - conf.endpoint_url, - conf.addressing_style, - ) - return self._s3_clients_h[bucket_name] - - def write(self, path: str, data: bytes) -> None: - """Write file with data, also select diffect bucket client for each - request based on the bucket. - - Args: - path (str): the path of file, if the path is relative path, it will be joined with parent_dir. - data (bytes): the data want to write. - """ - if path.startswith('s3://'): - bucket_name, path = parse_s3path(path) - s3_writer = self.__get_s3_client(bucket_name) - else: - s3_writer = self.__get_s3_client(self.default_bucket) - if self.default_prefix: - path = self.default_prefix + '/' + path - return s3_writer.write(path, data) diff --git a/magic_pdf/data/data_reader_writer/s3.py b/magic_pdf/data/data_reader_writer/s3.py deleted file mode 100644 index 34ec43b7c1b52ef931a8b06febba12c11ac7ab82..0000000000000000000000000000000000000000 --- a/magic_pdf/data/data_reader_writer/s3.py +++ /dev/null @@ -1,73 +0,0 @@ -from magic_pdf.data.data_reader_writer.multi_bucket_s3 import ( - MultiBucketS3DataReader, MultiBucketS3DataWriter) -from magic_pdf.data.schemas import S3Config - - -class S3DataReader(MultiBucketS3DataReader): - def __init__( - self, - default_prefix_without_bucket: str, - bucket: str, - ak: str, - sk: str, - endpoint_url: str, - addressing_style: str = 'auto', - ): - """s3 reader client. 
- - Args: - default_prefix_without_bucket: prefix that not contains bucket - bucket (str): bucket name - ak (str): access key - sk (str): secret key - endpoint_url (str): endpoint url of s3 - addressing_style (str, optional): Defaults to 'auto'. Other valid options here are 'path' and 'virtual' - refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html - """ - super().__init__( - f'{bucket}/{default_prefix_without_bucket}', - [ - S3Config( - bucket_name=bucket, - access_key=ak, - secret_key=sk, - endpoint_url=endpoint_url, - addressing_style=addressing_style, - ) - ], - ) - - -class S3DataWriter(MultiBucketS3DataWriter): - def __init__( - self, - default_prefix_without_bucket: str, - bucket: str, - ak: str, - sk: str, - endpoint_url: str, - addressing_style: str = 'auto', - ): - """s3 writer client. - - Args: - default_prefix_without_bucket: prefix that not contains bucket - bucket (str): bucket name - ak (str): access key - sk (str): secret key - endpoint_url (str): endpoint url of s3 - addressing_style (str, optional): Defaults to 'auto'. Other valid options here are 'path' and 'virtual' - refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html - """ - super().__init__( - f'{bucket}/{default_prefix_without_bucket}', - [ - S3Config( - bucket_name=bucket, - access_key=ak, - secret_key=sk, - endpoint_url=endpoint_url, - addressing_style=addressing_style, - ) - ], - ) diff --git a/magic_pdf/data/dataset.py b/magic_pdf/data/dataset.py deleted file mode 100644 index fb626e12cbfb7845fff1fed30dbbfdf650d507ea..0000000000000000000000000000000000000000 --- a/magic_pdf/data/dataset.py +++ /dev/null @@ -1,408 +0,0 @@ -import os -from abc import ABC, abstractmethod -from typing import Callable, Iterator - -import fitz -from loguru import logger - -from magic_pdf.config.enums import SupportedPdfParseMethod -from magic_pdf.data.schemas import PageInfo -from magic_pdf.data.utils import fitz_doc_to_image -from magic_pdf.filter import classify - - -class PageableData(ABC): - @abstractmethod - def get_image(self) -> dict: - """Transform data to image.""" - pass - - @abstractmethod - def get_doc(self) -> fitz.Page: - """Get the pymudoc page.""" - pass - - @abstractmethod - def get_page_info(self) -> PageInfo: - """Get the page info of the page. - - Returns: - PageInfo: the page info of this page - """ - pass - - @abstractmethod - def draw_rect(self, rect_coords, color, fill, fill_opacity, width, overlay): - """draw rectangle. - - Args: - rect_coords (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1] - color (list[float] | None): three element tuple which describe the RGB of the board line, None means no board line - fill (list[float] | None): fill the board with RGB, None means will not fill with color - fill_opacity (float): opacity of the fill, range from [0, 1] - width (float): the width of board - overlay (bool): fill the color in foreground or background. True means fill in background. - """ - pass - - @abstractmethod - def insert_text(self, coord, content, fontsize, color): - """insert text. - - Args: - coord (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1] - content (str): the text content - fontsize (int): font size of the text - color (list[float] | None): three element tuple which describe the RGB of the board line, None will use the default font color! 
- """ - pass - - -class Dataset(ABC): - @abstractmethod - def __len__(self) -> int: - """The length of the dataset.""" - pass - - @abstractmethod - def __iter__(self) -> Iterator[PageableData]: - """Yield the page data.""" - pass - - @abstractmethod - def supported_methods(self) -> list[SupportedPdfParseMethod]: - """The methods that this dataset support. - - Returns: - list[SupportedPdfParseMethod]: The supported methods, Valid methods are: OCR, TXT - """ - pass - - @abstractmethod - def data_bits(self) -> bytes: - """The bits used to create this dataset.""" - pass - - @abstractmethod - def get_page(self, page_id: int) -> PageableData: - """Get the page indexed by page_id. - - Args: - page_id (int): the index of the page - - Returns: - PageableData: the page doc object - """ - pass - - @abstractmethod - def dump_to_file(self, file_path: str): - """Dump the file. - - Args: - file_path (str): the file path - """ - pass - - @abstractmethod - def apply(self, proc: Callable, *args, **kwargs): - """Apply callable method which. - - Args: - proc (Callable): invoke proc as follows: - proc(self, *args, **kwargs) - - Returns: - Any: return the result generated by proc - """ - pass - - @abstractmethod - def classify(self) -> SupportedPdfParseMethod: - """classify the dataset. - - Returns: - SupportedPdfParseMethod: _description_ - """ - pass - - @abstractmethod - def clone(self): - """clone this dataset.""" - pass - - -class PymuDocDataset(Dataset): - def __init__(self, bits: bytes, lang=None): - """Initialize the dataset, which wraps the pymudoc documents. - - Args: - bits (bytes): the bytes of the pdf - """ - self._raw_fitz = fitz.open('pdf', bits) - self._records = [Doc(v) for v in self._raw_fitz] - self._data_bits = bits - self._raw_data = bits - self._classify_result = None - - if lang == '': - self._lang = None - elif lang == 'auto': - from magic_pdf.model.sub_modules.language_detection.utils import \ - auto_detect_lang - self._lang = auto_detect_lang(self._data_bits) - logger.info(f'lang: {lang}, detect_lang: {self._lang}') - else: - self._lang = lang - logger.info(f'lang: {lang}') - - def __len__(self) -> int: - """The page number of the pdf.""" - return len(self._records) - - def __iter__(self) -> Iterator[PageableData]: - """Yield the page doc object.""" - return iter(self._records) - - def supported_methods(self) -> list[SupportedPdfParseMethod]: - """The method supported by this dataset. - - Returns: - list[SupportedPdfParseMethod]: the supported methods - """ - return [SupportedPdfParseMethod.OCR, SupportedPdfParseMethod.TXT] - - def data_bits(self) -> bytes: - """The pdf bits used to create this dataset.""" - return self._data_bits - - def get_page(self, page_id: int) -> PageableData: - """The page doc object. - - Args: - page_id (int): the page doc index - - Returns: - PageableData: the page doc object - """ - return self._records[page_id] - - def dump_to_file(self, file_path: str): - """Dump the file. - - Args: - file_path (str): the file path - """ - - dir_name = os.path.dirname(file_path) - if dir_name not in ('', '.', '..'): - os.makedirs(dir_name, exist_ok=True) - self._raw_fitz.save(file_path) - - def apply(self, proc: Callable, *args, **kwargs): - """Apply callable method which. 
- - Args: - proc (Callable): invoke proc as follows: - proc(dataset, *args, **kwargs) - - Returns: - Any: return the result generated by proc - """ - if 'lang' in kwargs and self._lang is not None: - kwargs['lang'] = self._lang - return proc(self, *args, **kwargs) - - def classify(self) -> SupportedPdfParseMethod: - """classify the dataset. - - Returns: - SupportedPdfParseMethod: _description_ - """ - if self._classify_result is None: - self._classify_result = classify(self._data_bits) - return self._classify_result - - def clone(self): - """clone this dataset.""" - return PymuDocDataset(self._raw_data) - - def set_images(self, images): - for i in range(len(self._records)): - self._records[i].set_image(images[i]) - -class ImageDataset(Dataset): - def __init__(self, bits: bytes, lang=None): - """Initialize the dataset, which wraps the pymudoc documents. - - Args: - bits (bytes): the bytes of the photo which will be converted to pdf first. then converted to pymudoc. - """ - pdf_bytes = fitz.open(stream=bits).convert_to_pdf() - self._raw_fitz = fitz.open('pdf', pdf_bytes) - self._records = [Doc(v) for v in self._raw_fitz] - self._raw_data = bits - self._data_bits = pdf_bytes - - if lang == '': - self._lang = None - elif lang == 'auto': - from magic_pdf.model.sub_modules.language_detection.utils import \ - auto_detect_lang - self._lang = auto_detect_lang(self._data_bits) - logger.info(f'lang: {lang}, detect_lang: {self._lang}') - else: - self._lang = lang - logger.info(f'lang: {lang}') - - def __len__(self) -> int: - """The length of the dataset.""" - return len(self._records) - - def __iter__(self) -> Iterator[PageableData]: - """Yield the page object.""" - return iter(self._records) - - def supported_methods(self): - """The method supported by this dataset. - - Returns: - list[SupportedPdfParseMethod]: the supported methods - """ - return [SupportedPdfParseMethod.OCR] - - def data_bits(self) -> bytes: - """The pdf bits used to create this dataset.""" - return self._data_bits - - def get_page(self, page_id: int) -> PageableData: - """The page doc object. - - Args: - page_id (int): the page doc index - - Returns: - PageableData: the page doc object - """ - return self._records[page_id] - - def dump_to_file(self, file_path: str): - """Dump the file. - - Args: - file_path (str): the file path - """ - dir_name = os.path.dirname(file_path) - if dir_name not in ('', '.', '..'): - os.makedirs(dir_name, exist_ok=True) - self._raw_fitz.save(file_path) - - def apply(self, proc: Callable, *args, **kwargs): - """Apply callable method which. - - Args: - proc (Callable): invoke proc as follows: - proc(dataset, *args, **kwargs) - - Returns: - Any: return the result generated by proc - """ - return proc(self, *args, **kwargs) - - def classify(self) -> SupportedPdfParseMethod: - """classify the dataset. - - Returns: - SupportedPdfParseMethod: _description_ - """ - return SupportedPdfParseMethod.OCR - - def clone(self): - """clone this dataset.""" - return ImageDataset(self._raw_data) - - def set_images(self, images): - for i in range(len(self._records)): - self._records[i].set_image(images[i]) - -class Doc(PageableData): - """Initialized with pymudoc object.""" - - def __init__(self, doc: fitz.Page): - self._doc = doc - self._img = None - - def get_image(self): - """Return the image info. 
- - Returns: - dict: { - img: np.ndarray, - width: int, - height: int - } - """ - if self._img is None: - self._img = fitz_doc_to_image(self._doc) - return self._img - - def set_image(self, img): - """ - Args: - img (np.ndarray): the image - """ - if self._img is None: - self._img = img - - def get_doc(self) -> fitz.Page: - """Get the pymudoc object. - - Returns: - fitz.Page: the pymudoc object - """ - return self._doc - - def get_page_info(self) -> PageInfo: - """Get the page info of the page. - - Returns: - PageInfo: the page info of this page - """ - page_w = self._doc.rect.width - page_h = self._doc.rect.height - return PageInfo(w=page_w, h=page_h) - - def __getattr__(self, name): - if hasattr(self._doc, name): - return getattr(self._doc, name) - - def draw_rect(self, rect_coords, color, fill, fill_opacity, width, overlay): - """draw rectangle. - - Args: - rect_coords (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1] - color (list[float] | None): three element tuple which describe the RGB of the board line, None means no board line - fill (list[float] | None): fill the board with RGB, None means will not fill with color - fill_opacity (float): opacity of the fill, range from [0, 1] - width (float): the width of board - overlay (bool): fill the color in foreground or background. True means fill in background. - """ - self._doc.draw_rect( - rect_coords, - color=color, - fill=fill, - fill_opacity=fill_opacity, - width=width, - overlay=overlay, - ) - - def insert_text(self, coord, content, fontsize, color): - """insert text. - - Args: - coord (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1] - content (str): the text content - fontsize (int): font size of the text - color (list[float] | None): three element tuple which describe the RGB of the board line, None will use the default font color! - """ - self._doc.insert_text(coord, content, fontsize=fontsize, color=color) \ No newline at end of file diff --git a/magic_pdf/data/io/__init__.py b/magic_pdf/data/io/__init__.py deleted file mode 100644 index badf1df07551df611dc955710743f26bf5f60595..0000000000000000000000000000000000000000 --- a/magic_pdf/data/io/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ - -from magic_pdf.data.io.base import IOReader, IOWriter # noqa: F401 -from magic_pdf.data.io.http import HttpReader, HttpWriter # noqa: F401 -from magic_pdf.data.io.s3 import S3Reader, S3Writer # noqa: F401 - -__all__ = ['IOReader', 'IOWriter', 'HttpReader', 'HttpWriter', 'S3Reader', 'S3Writer'] \ No newline at end of file diff --git a/magic_pdf/data/io/base.py b/magic_pdf/data/io/base.py deleted file mode 100644 index 3c163d1fe97f9f40820fbd710f85a67bcccd4b34..0000000000000000000000000000000000000000 --- a/magic_pdf/data/io/base.py +++ /dev/null @@ -1,42 +0,0 @@ -from abc import ABC, abstractmethod - - -class IOReader(ABC): - @abstractmethod - def read(self, path: str) -> bytes: - """Read the file. - - Args: - path (str): file path to read - - Returns: - bytes: the content of the file - """ - pass - - @abstractmethod - def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes: - """Read at offset and limit. - - Args: - path (str): the path of file, if the path is relative path, it will be joined with parent_dir. - offset (int, optional): the number of bytes skipped. Defaults to 0. - limit (int, optional): the length of bytes want to read. Defaults to -1. 
- - Returns: - bytes: the content of file - """ - pass - - -class IOWriter(ABC): - - @abstractmethod - def write(self, path: str, data: bytes) -> None: - """Write file with data. - - Args: - path (str): the path of file, if the path is relative path, it will be joined with parent_dir. - data (bytes): the data want to write - """ - pass diff --git a/magic_pdf/data/io/http.py b/magic_pdf/data/io/http.py deleted file mode 100644 index 3b08271f05a8ad6e2163f1e357fe66ab4a713b48..0000000000000000000000000000000000000000 --- a/magic_pdf/data/io/http.py +++ /dev/null @@ -1,37 +0,0 @@ - -import io - -import requests - -from magic_pdf.data.io.base import IOReader, IOWriter - - -class HttpReader(IOReader): - - def read(self, url: str) -> bytes: - """Read the file. - - Args: - path (str): file path to read - - Returns: - bytes: the content of the file - """ - return requests.get(url).content - - def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes: - """Not Implemented.""" - raise NotImplementedError - - -class HttpWriter(IOWriter): - def write(self, url: str, data: bytes) -> None: - """Write file with data. - - Args: - path (str): the path of file, if the path is relative path, it will be joined with parent_dir. - data (bytes): the data want to write - """ - files = {'file': io.BytesIO(data)} - response = requests.post(url, files=files) - assert 300 > response.status_code and response.status_code > 199 diff --git a/magic_pdf/data/io/s3.py b/magic_pdf/data/io/s3.py deleted file mode 100644 index 4222c73fecdeb99283fa2d0ef419d2f3cde06cb5..0000000000000000000000000000000000000000 --- a/magic_pdf/data/io/s3.py +++ /dev/null @@ -1,114 +0,0 @@ -import boto3 -from botocore.config import Config - -from magic_pdf.data.io.base import IOReader, IOWriter - - -class S3Reader(IOReader): - def __init__( - self, - bucket: str, - ak: str, - sk: str, - endpoint_url: str, - addressing_style: str = 'auto', - ): - """s3 reader client. - - Args: - bucket (str): bucket name - ak (str): access key - sk (str): secret key - endpoint_url (str): endpoint url of s3 - addressing_style (str, optional): Defaults to 'auto'. Other valid options here are 'path' and 'virtual' - refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html - """ - self._bucket = bucket - self._ak = ak - self._sk = sk - self._s3_client = boto3.client( - service_name='s3', - aws_access_key_id=ak, - aws_secret_access_key=sk, - endpoint_url=endpoint_url, - config=Config( - s3={'addressing_style': addressing_style}, - retries={'max_attempts': 5, 'mode': 'standard'}, - ), - ) - - def read(self, key: str) -> bytes: - """Read the file. - - Args: - path (str): file path to read - - Returns: - bytes: the content of the file - """ - return self.read_at(key) - - def read_at(self, key: str, offset: int = 0, limit: int = -1) -> bytes: - """Read at offset and limit. - - Args: - path (str): the path of file, if the path is relative path, it will be joined with parent_dir. - offset (int, optional): the number of bytes skipped. Defaults to 0. - limit (int, optional): the length of bytes want to read. Defaults to -1. 
- - Returns: - bytes: the content of file - """ - if limit > -1: - range_header = f'bytes={offset}-{offset+limit-1}' - res = self._s3_client.get_object( - Bucket=self._bucket, Key=key, Range=range_header - ) - else: - res = self._s3_client.get_object( - Bucket=self._bucket, Key=key, Range=f'bytes={offset}-' - ) - return res['Body'].read() - - -class S3Writer(IOWriter): - def __init__( - self, - bucket: str, - ak: str, - sk: str, - endpoint_url: str, - addressing_style: str = 'auto', - ): - """s3 reader client. - - Args: - bucket (str): bucket name - ak (str): access key - sk (str): secret key - endpoint_url (str): endpoint url of s3 - addressing_style (str, optional): Defaults to 'auto'. Other valid options here are 'path' and 'virtual' - refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html - """ - self._bucket = bucket - self._ak = ak - self._sk = sk - self._s3_client = boto3.client( - service_name='s3', - aws_access_key_id=ak, - aws_secret_access_key=sk, - endpoint_url=endpoint_url, - config=Config( - s3={'addressing_style': addressing_style}, - retries={'max_attempts': 5, 'mode': 'standard'}, - ), - ) - - def write(self, key: str, data: bytes): - """Write file with data. - - Args: - path (str): the path of file, if the path is relative path, it will be joined with parent_dir. - data (bytes): the data want to write - """ - self._s3_client.put_object(Bucket=self._bucket, Key=key, Body=data) diff --git a/magic_pdf/data/read_api.py b/magic_pdf/data/read_api.py deleted file mode 100644 index 9e52af6d8976910975b987529871b0acbae239bb..0000000000000000000000000000000000000000 --- a/magic_pdf/data/read_api.py +++ /dev/null @@ -1,142 +0,0 @@ -import json -import os -import tempfile -import shutil -from pathlib import Path - -from magic_pdf.config.exceptions import EmptyData, InvalidParams -from magic_pdf.data.data_reader_writer import (FileBasedDataReader, - MultiBucketS3DataReader) -from magic_pdf.data.dataset import ImageDataset, PymuDocDataset -from magic_pdf.utils.office_to_pdf import convert_file_to_pdf, ConvertToPdfError - -def read_jsonl( - s3_path_or_local: str, s3_client: MultiBucketS3DataReader | None = None -) -> list[PymuDocDataset]: - """Read the jsonl file and return the list of PymuDocDataset. - - Args: - s3_path_or_local (str): local file or s3 path - s3_client (MultiBucketS3DataReader | None, optional): s3 client that support multiple bucket. Defaults to None. - - Raises: - InvalidParams: if s3_path_or_local is s3 path but s3_client is not provided. - EmptyData: if no pdf file location is provided in some line of jsonl file. 
- InvalidParams: if the file location is s3 path but s3_client is not provided - - Returns: - list[PymuDocDataset]: each line in the jsonl file will be converted to a PymuDocDataset - """ - bits_arr = [] - if s3_path_or_local.startswith('s3://'): - if s3_client is None: - raise InvalidParams('s3_client is required when s3_path is provided') - jsonl_bits = s3_client.read(s3_path_or_local) - else: - jsonl_bits = FileBasedDataReader('').read(s3_path_or_local) - jsonl_d = [ - json.loads(line) for line in jsonl_bits.decode().split('\n') if line.strip() - ] - for d in jsonl_d: - pdf_path = d.get('file_location', '') or d.get('path', '') - if len(pdf_path) == 0: - raise EmptyData('pdf file location is empty') - if pdf_path.startswith('s3://'): - if s3_client is None: - raise InvalidParams('s3_client is required when s3_path is provided') - bits_arr.append(s3_client.read(pdf_path)) - else: - bits_arr.append(FileBasedDataReader('').read(pdf_path)) - return [PymuDocDataset(bits) for bits in bits_arr] - - -def read_local_pdfs(path: str) -> list[PymuDocDataset]: - """Read pdf from path or directory. - - Args: - path (str): pdf file path or directory that contains pdf files - - Returns: - list[PymuDocDataset]: each pdf file will converted to a PymuDocDataset - """ - if os.path.isdir(path): - reader = FileBasedDataReader() - ret = [] - for root, _, files in os.walk(path): - for file in files: - suffix = file.split('.') - if suffix[-1] == 'pdf': - ret.append( PymuDocDataset(reader.read(os.path.join(root, file)))) - return ret - else: - reader = FileBasedDataReader() - bits = reader.read(path) - return [PymuDocDataset(bits)] - -def read_local_office(path: str) -> list[PymuDocDataset]: - """Read ms-office file (ppt, pptx, doc, docx) from path or directory. - - Args: - path (str): ms-office file or directory that contains ms-office files - - Returns: - list[PymuDocDataset]: each ms-office file will converted to a PymuDocDataset - - Raises: - ConvertToPdfError: Failed to convert ms-office file to pdf via libreoffice - FileNotFoundError: File not Found - Exception: Unknown Exception raised - """ - suffixes = ['.ppt', '.pptx', '.doc', '.docx'] - fns = [] - ret = [] - if os.path.isdir(path): - for root, _, files in os.walk(path): - for file in files: - suffix = Path(file).suffix - if suffix in suffixes: - fns.append((os.path.join(root, file))) - else: - fns.append(path) - - reader = FileBasedDataReader() - temp_dir = tempfile.mkdtemp() - for fn in fns: - try: - convert_file_to_pdf(fn, temp_dir) - except ConvertToPdfError as e: - raise e - except FileNotFoundError as e: - raise e - except Exception as e: - raise e - fn_path = Path(fn) - pdf_fn = f"{temp_dir}/{fn_path.stem}.pdf" - ret.append(PymuDocDataset(reader.read(pdf_fn))) - shutil.rmtree(temp_dir) - return ret - -def read_local_images(path: str, suffixes: list[str]=['.png', '.jpg', '.jpeg']) -> list[ImageDataset]: - """Read images from path or directory. - - Args: - path (str): image file path or directory that contains image files - suffixes (list[str]): the suffixes of the image files used to filter the files. 
Example: ['.jpg', '.png'] - - Returns: - list[ImageDataset]: each image file will converted to a ImageDataset - """ - if os.path.isdir(path): - imgs_bits = [] - s_suffixes = set(suffixes) - reader = FileBasedDataReader() - for root, _, files in os.walk(path): - for file in files: - suffix = Path(file).suffix - if suffix in s_suffixes: - imgs_bits.append(reader.read(os.path.join(root, file))) - return [ImageDataset(bits) for bits in imgs_bits] - else: - reader = FileBasedDataReader() - bits = reader.read(path) - return [ImageDataset(bits)] diff --git a/magic_pdf/data/schemas.py b/magic_pdf/data/schemas.py deleted file mode 100644 index c2efb46aac565a434bbbd50568c295ca2776db2e..0000000000000000000000000000000000000000 --- a/magic_pdf/data/schemas.py +++ /dev/null @@ -1,19 +0,0 @@ - -from pydantic import BaseModel, Field - - -class S3Config(BaseModel): - """S3 config - """ - bucket_name: str = Field(description='s3 bucket name', min_length=1) - access_key: str = Field(description='s3 access key', min_length=1) - secret_key: str = Field(description='s3 secret key', min_length=1) - endpoint_url: str = Field(description='s3 endpoint url', min_length=1) - addressing_style: str = Field(description='s3 addressing style', default='auto', min_length=1) - - -class PageInfo(BaseModel): - """The width and height of page - """ - w: float = Field(description='the width of page') - h: float = Field(description='the height of page') diff --git a/magic_pdf/data/utils.py b/magic_pdf/data/utils.py deleted file mode 100644 index 849fa780939ddba531029500b158280658af8ea3..0000000000000000000000000000000000000000 --- a/magic_pdf/data/utils.py +++ /dev/null @@ -1,166 +0,0 @@ - -import multiprocessing as mp -import threading -from concurrent.futures import (ProcessPoolExecutor, ThreadPoolExecutor, - as_completed) - -import fitz -import numpy as np -from loguru import logger - - - -def fitz_doc_to_image(page, dpi=200) -> dict: - """Convert fitz.Document to image, Then convert the image to numpy array. - - Args: - page (_type_): pymudoc page - dpi (int, optional): reset the dpi of dpi. Defaults to 200. - - Returns: - dict: {'img': numpy array, 'width': width, 'height': height } - """ - mat = fitz.Matrix(dpi / 72, dpi / 72) - pm = page.get_pixmap(matrix=mat, alpha=False) - - # If the width or height exceeds 4500 after scaling, do not scale further. - if pm.width > 4500 or pm.height > 4500: - pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False) - - # Convert pixmap samples directly to numpy array - img = np.frombuffer(pm.samples, dtype=np.uint8).reshape(pm.height, pm.width, 3) - - img_dict = {'img': img, 'width': pm.width, 'height': pm.height} - - return img_dict - -def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id=None) -> list: - images = [] - with fitz.open('pdf', pdf_bytes) as doc: - pdf_page_num = doc.page_count - end_page_id = ( - end_page_id - if end_page_id is not None and end_page_id >= 0 - else pdf_page_num - 1 - ) - if end_page_id > pdf_page_num - 1: - logger.warning('end_page_id is out of range, use images length') - end_page_id = pdf_page_num - 1 - - for index in range(0, doc.page_count): - if start_page_id <= index <= end_page_id: - page = doc[index] - mat = fitz.Matrix(dpi / 72, dpi / 72) - pm = page.get_pixmap(matrix=mat, alpha=False) - - # If the width or height exceeds 4500 after scaling, do not scale further. 
- if pm.width > 4500 or pm.height > 4500: - pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False) - - # Convert pixmap samples directly to numpy array - img = np.frombuffer(pm.samples, dtype=np.uint8).reshape(pm.height, pm.width, 3) - - img_dict = {'img': img, 'width': pm.width, 'height': pm.height} - else: - img_dict = {'img': [], 'width': 0, 'height': 0} - - images.append(img_dict) - return images - - -def convert_page(bytes_page): - pdfs = fitz.open('pdf', bytes_page) - page = pdfs[0] - return fitz_doc_to_image(page) - -def parallel_process_pdf_safe(pages, num_workers=None, **kwargs): - """Process PDF pages in parallel with serialization-safe approach.""" - if num_workers is None: - num_workers = mp.cpu_count() - - - # Process the extracted page data in parallel - with ProcessPoolExecutor(max_workers=num_workers) as executor: - # Process the page data - results = list( - executor.map(convert_page, pages) - ) - - return results - - -def threaded_process_pdf(pdf_path, num_threads=4, **kwargs): - """Process all pages of a PDF using multiple threads. - - Parameters: - ----------- - pdf_path : str - Path to the PDF file - num_threads : int - Number of threads to use - **kwargs : - Additional arguments for fitz_doc_to_image - - Returns: - -------- - images : list - List of processed images, in page order - """ - # Open the PDF - doc = fitz.open(pdf_path) - num_pages = len(doc) - - # Create a list to store results in the correct order - results = [None] * num_pages - - # Create a thread pool - with ThreadPoolExecutor(max_workers=num_threads) as executor: - # Submit all tasks - futures = {} - for page_num in range(num_pages): - page = doc[page_num] - future = executor.submit(fitz_doc_to_image, page, **kwargs) - futures[future] = page_num - # Process results as they complete with progress bar - for future in as_completed(futures): - page_num = futures[future] - try: - results[page_num] = future.result() - except Exception as e: - print(f'Error processing page {page_num}: {e}') - results[page_num] = None - - # Close the document - doc.close() - -if __name__ == '__main__': - pdf = fitz.open('/tmp/[MS-DOC].pdf') - - - pdf_page = [fitz.open() for i in range(pdf.page_count)] - [pdf_page[i].insert_pdf(pdf, from_page=i, to_page=i) for i in range(pdf.page_count)] - - pdf_page = [v.tobytes() for v in pdf_page] - results = parallel_process_pdf_safe(pdf_page, num_workers=16) - - # threaded_process_pdf('/tmp/[MS-DOC].pdf', num_threads=16) - - """ benchmark results of multi-threaded processing (fitz page to image) - total page nums: 578 - thread nums, time cost - 1 7.351 sec - 2 6.334 sec - 4 5.968 sec - 8 6.728 sec - 16 8.085 sec - """ - - """ benchmark results of multi-processor processing (fitz page to image) - total page nums: 578 - processor nums, time cost - 1 17.170 sec - 2 10.170 sec - 4 7.841 sec - 8 7.900 sec - 16 7.984 sec - """ diff --git a/magic_pdf/dict2md/__init__.py b/magic_pdf/dict2md/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/magic_pdf/dict2md/ocr_mkcontent.py b/magic_pdf/dict2md/ocr_mkcontent.py deleted file mode 100644 index 997b1832a96be934f93c1c6c680b3de35d79bcc6..0000000000000000000000000000000000000000 --- a/magic_pdf/dict2md/ocr_mkcontent.py +++ /dev/null @@ -1,352 +0,0 @@ -import re - -from loguru import logger - -from magic_pdf.config.make_content_config import DropMode, MakeMode -from magic_pdf.config.ocr_content_type import BlockType, ContentType -from magic_pdf.libs.commons 
import join_path -from magic_pdf.libs.config_reader import get_latex_delimiter_config -from magic_pdf.libs.language import detect_lang -from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char -from magic_pdf.post_proc.para_split_v3 import ListLineTag - - -def __is_hyphen_at_line_end(line): - """Check if a line ends with one or more letters followed by a hyphen. - - Args: - line (str): The line of text to check. - - Returns: - bool: True if the line ends with one or more letters followed by a hyphen, False otherwise. - """ - # Use regex to check if the line ends with one or more letters followed by a hyphen - return bool(re.search(r'[A-Za-z]+-\s*$', line)) - - -def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list, - img_buket_path): - markdown_with_para_and_pagination = [] - page_no = 0 - for page_info in pdf_info_dict: - paras_of_layout = page_info.get('para_blocks') - if not paras_of_layout: - markdown_with_para_and_pagination.append({ - 'page_no': - page_no, - 'md_content': - '', - }) - page_no += 1 - continue - page_markdown = ocr_mk_markdown_with_para_core_v2( - paras_of_layout, 'mm', img_buket_path) - markdown_with_para_and_pagination.append({ - 'page_no': - page_no, - 'md_content': - '\n\n'.join(page_markdown) - }) - page_no += 1 - return markdown_with_para_and_pagination - - -def ocr_mk_markdown_with_para_core_v2(paras_of_layout, - mode, - img_buket_path='', - ): - page_markdown = [] - for para_block in paras_of_layout: - para_text = '' - para_type = para_block['type'] - if para_type in [BlockType.Text, BlockType.List, BlockType.Index]: - para_text = merge_para_with_text(para_block) - elif para_type == BlockType.Title: - title_level = get_title_level(para_block) - para_text = f'{"#" * title_level} {merge_para_with_text(para_block)}' - elif para_type == BlockType.InterlineEquation: - para_text = merge_para_with_text(para_block) - elif para_type == BlockType.Image: - if mode == 'nlp': - continue - elif mode == 'mm': - # 检测是否存在图片脚注 - has_image_footnote = any(block['type'] == BlockType.ImageFootnote for block in para_block['blocks']) - # 如果存在图片脚注,则将图片脚注拼接到图片正文后面 - if has_image_footnote: - for block in para_block['blocks']: # 1st.拼image_caption - if block['type'] == BlockType.ImageCaption: - para_text += merge_para_with_text(block) + ' \n' - for block in para_block['blocks']: # 2nd.拼image_body - if block['type'] == BlockType.ImageBody: - for line in block['lines']: - for span in line['spans']: - if span['type'] == ContentType.Image: - if span.get('image_path', ''): - para_text += f"![]({img_buket_path}/{span['image_path']})" - for block in para_block['blocks']: # 3rd.拼image_footnote - if block['type'] == BlockType.ImageFootnote: - para_text += ' \n' + merge_para_with_text(block) - else: - for block in para_block['blocks']: # 1st.拼image_body - if block['type'] == BlockType.ImageBody: - for line in block['lines']: - for span in line['spans']: - if span['type'] == ContentType.Image: - if span.get('image_path', ''): - para_text += f"![]({img_buket_path}/{span['image_path']})" - for block in para_block['blocks']: # 2nd.拼image_caption - if block['type'] == BlockType.ImageCaption: - para_text += ' \n' + merge_para_with_text(block) - elif para_type == BlockType.Table: - if mode == 'nlp': - continue - elif mode == 'mm': - for block in para_block['blocks']: # 1st.拼table_caption - if block['type'] == BlockType.TableCaption: - para_text += merge_para_with_text(block) + ' \n' - for block in para_block['blocks']: # 2nd.拼table_body - if block['type'] == 
BlockType.TableBody: - for line in block['lines']: - for span in line['spans']: - if span['type'] == ContentType.Table: - # if processed by table model - if span.get('html', ''): - para_text += f"\n{span['html']}\n" - elif span.get('image_path', ''): - para_text += f"![]({img_buket_path}/{span['image_path']})" - for block in para_block['blocks']: # 3rd.拼table_footnote - if block['type'] == BlockType.TableFootnote: - para_text += '\n' + merge_para_with_text(block) + ' ' - - if para_text.strip() == '': - continue - else: - # page_markdown.append(para_text.strip() + ' ') - page_markdown.append(para_text.strip()) - - return page_markdown - - -def detect_language(text): - en_pattern = r'[a-zA-Z]+' - en_matches = re.findall(en_pattern, text) - en_length = sum(len(match) for match in en_matches) - if len(text) > 0: - if en_length / len(text) >= 0.5: - return 'en' - else: - return 'unknown' - else: - return 'empty' - - -def full_to_half(text: str) -> str: - """Convert full-width characters to half-width characters using code point manipulation. - - Args: - text: String containing full-width characters - - Returns: - String with full-width characters converted to half-width - """ - result = [] - for char in text: - code = ord(char) - # Full-width letters and numbers (FF21-FF3A for A-Z, FF41-FF5A for a-z, FF10-FF19 for 0-9) - if (0xFF21 <= code <= 0xFF3A) or (0xFF41 <= code <= 0xFF5A) or (0xFF10 <= code <= 0xFF19): - result.append(chr(code - 0xFEE0)) # Shift to ASCII range - else: - result.append(char) - return ''.join(result) - -latex_delimiters_config = get_latex_delimiter_config() - -default_delimiters = { - 'display': {'left': '$$', 'right': '$$'}, - 'inline': {'left': '$', 'right': '$'} -} - -delimiters = latex_delimiters_config if latex_delimiters_config else default_delimiters - -display_left_delimiter = delimiters['display']['left'] -display_right_delimiter = delimiters['display']['right'] -inline_left_delimiter = delimiters['inline']['left'] -inline_right_delimiter = delimiters['inline']['right'] - -def merge_para_with_text(para_block): - block_text = '' - for line in para_block['lines']: - for span in line['spans']: - if span['type'] in [ContentType.Text]: - span['content'] = full_to_half(span['content']) - block_text += span['content'] - block_lang = detect_lang(block_text) - - para_text = '' - for i, line in enumerate(para_block['lines']): - - if i >= 1 and line.get(ListLineTag.IS_LIST_START_LINE, False): - para_text += ' \n' - - for j, span in enumerate(line['spans']): - - span_type = span['type'] - content = '' - if span_type == ContentType.Text: - content = ocr_escape_special_markdown_char(span['content']) - elif span_type == ContentType.InlineEquation: - content = f"{inline_left_delimiter}{span['content']}{inline_right_delimiter}" - elif span_type == ContentType.InterlineEquation: - content = f"\n{display_left_delimiter}\n{span['content']}\n{display_right_delimiter}\n" - - content = content.strip() - - if content: - langs = ['zh', 'ja', 'ko'] - # logger.info(f'block_lang: {block_lang}, content: {content}') - if block_lang in langs: # 中文/日语/韩文语境下,换行不需要空格分隔,但是如果是行内公式结尾,还是要加空格 - if j == len(line['spans']) - 1 and span_type not in [ContentType.InlineEquation]: - para_text += content - else: - para_text += f'{content} ' - else: - if span_type in [ContentType.Text, ContentType.InlineEquation]: - # 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除 - if j == len(line['spans'])-1 and span_type == ContentType.Text and __is_hyphen_at_line_end(content): - para_text += content[:-1] - else: # 西方文本语境下 
content间需要空格分隔 - para_text += f'{content} ' - elif span_type == ContentType.InterlineEquation: - para_text += content - else: - continue - # 连写字符拆分 - # para_text = __replace_ligatures(para_text) - - return para_text - - -def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason=None): - para_type = para_block['type'] - para_content = {} - if para_type in [BlockType.Text, BlockType.List, BlockType.Index]: - para_content = { - 'type': 'text', - 'text': merge_para_with_text(para_block), - } - elif para_type == BlockType.Title: - para_content = { - 'type': 'text', - 'text': merge_para_with_text(para_block), - } - title_level = get_title_level(para_block) - if title_level != 0: - para_content['text_level'] = title_level - elif para_type == BlockType.InterlineEquation: - para_content = { - 'type': 'equation', - 'text': merge_para_with_text(para_block), - 'text_format': 'latex', - } - elif para_type == BlockType.Image: - para_content = {'type': 'image', 'img_path': '', 'img_caption': [], 'img_footnote': []} - for block in para_block['blocks']: - if block['type'] == BlockType.ImageBody: - for line in block['lines']: - for span in line['spans']: - if span['type'] == ContentType.Image: - if span.get('image_path', ''): - para_content['img_path'] = join_path(img_buket_path, span['image_path']) - if block['type'] == BlockType.ImageCaption: - para_content['img_caption'].append(merge_para_with_text(block)) - if block['type'] == BlockType.ImageFootnote: - para_content['img_footnote'].append(merge_para_with_text(block)) - elif para_type == BlockType.Table: - para_content = {'type': 'table', 'img_path': '', 'table_caption': [], 'table_footnote': []} - for block in para_block['blocks']: - if block['type'] == BlockType.TableBody: - for line in block['lines']: - for span in line['spans']: - if span['type'] == ContentType.Table: - - if span.get('latex', ''): - para_content['table_body'] = f"{span['latex']}" - elif span.get('html', ''): - para_content['table_body'] = f"{span['html']}" - - if span.get('image_path', ''): - para_content['img_path'] = join_path(img_buket_path, span['image_path']) - - if block['type'] == BlockType.TableCaption: - para_content['table_caption'].append(merge_para_with_text(block)) - if block['type'] == BlockType.TableFootnote: - para_content['table_footnote'].append(merge_para_with_text(block)) - - para_content['page_idx'] = page_idx - - if drop_reason is not None: - para_content['drop_reason'] = drop_reason - - return para_content - - -def union_make(pdf_info_dict: list, - make_mode: str, - drop_mode: str, - img_buket_path: str = '', - ): - output_content = [] - for page_info in pdf_info_dict: - drop_reason_flag = False - drop_reason = None - if page_info.get('need_drop', False): - drop_reason = page_info.get('drop_reason') - if drop_mode == DropMode.NONE: - pass - elif drop_mode == DropMode.NONE_WITH_REASON: - drop_reason_flag = True - elif drop_mode == DropMode.WHOLE_PDF: - raise Exception((f'drop_mode is {DropMode.WHOLE_PDF} ,' - f'drop_reason is {drop_reason}')) - elif drop_mode == DropMode.SINGLE_PAGE: - logger.warning((f'drop_mode is {DropMode.SINGLE_PAGE} ,' - f'drop_reason is {drop_reason}')) - continue - else: - raise Exception('drop_mode can not be null') - - paras_of_layout = page_info.get('para_blocks') - page_idx = page_info.get('page_idx') - if not paras_of_layout: - continue - if make_mode == MakeMode.MM_MD: - page_markdown = ocr_mk_markdown_with_para_core_v2( - paras_of_layout, 'mm', img_buket_path) - output_content.extend(page_markdown) - elif 
make_mode == MakeMode.NLP_MD: - page_markdown = ocr_mk_markdown_with_para_core_v2( - paras_of_layout, 'nlp') - output_content.extend(page_markdown) - elif make_mode == MakeMode.STANDARD_FORMAT: - for para_block in paras_of_layout: - if drop_reason_flag: - para_content = para_to_standard_format_v2( - para_block, img_buket_path, page_idx) - else: - para_content = para_to_standard_format_v2( - para_block, img_buket_path, page_idx) - output_content.append(para_content) - if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]: - return '\n\n'.join(output_content) - elif make_mode == MakeMode.STANDARD_FORMAT: - return output_content - - -def get_title_level(block): - title_level = block.get('level', 1) - if title_level > 4: - title_level = 4 - elif title_level < 1: - title_level = 0 - return title_level \ No newline at end of file diff --git a/magic_pdf/filter/__init__.py b/magic_pdf/filter/__init__.py deleted file mode 100644 index 280156358b1417c1526ade41302a7f21b09863e0..0000000000000000000000000000000000000000 --- a/magic_pdf/filter/__init__.py +++ /dev/null @@ -1,32 +0,0 @@ - -from magic_pdf.config.drop_reason import DropReason -from magic_pdf.config.enums import SupportedPdfParseMethod -from magic_pdf.filter.pdf_classify_by_type import classify as do_classify -from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan - - -def classify(pdf_bytes: bytes) -> SupportedPdfParseMethod: - """根据pdf的元数据,判断是文本pdf,还是ocr pdf.""" - pdf_meta = pdf_meta_scan(pdf_bytes) - if pdf_meta.get('_need_drop', False): # 如果返回了需要丢弃的标志,则抛出异常 - raise Exception(f"pdf meta_scan need_drop,reason is {pdf_meta['_drop_reason']}") - else: - is_encrypted = pdf_meta['is_encrypted'] - is_needs_password = pdf_meta['is_needs_password'] - if is_encrypted or is_needs_password: # 加密的,需要密码的,没有页面的,都不处理 - raise Exception(f'pdf meta_scan need_drop,reason is {DropReason.ENCRYPTED}') - else: - is_text_pdf, results = do_classify( - pdf_meta['total_page'], - pdf_meta['page_width_pts'], - pdf_meta['page_height_pts'], - pdf_meta['image_info_per_page'], - pdf_meta['text_len_per_page'], - pdf_meta['imgs_per_page'], - # pdf_meta['text_layout_per_page'], - pdf_meta['invalid_chars'], - ) - if is_text_pdf: - return SupportedPdfParseMethod.TXT - else: - return SupportedPdfParseMethod.OCR diff --git a/magic_pdf/filter/pdf_classify_by_type.py b/magic_pdf/filter/pdf_classify_by_type.py deleted file mode 100644 index 50665737287c2d1798924c3aa30980ce280a3c7d..0000000000000000000000000000000000000000 --- a/magic_pdf/filter/pdf_classify_by_type.py +++ /dev/null @@ -1,395 +0,0 @@ -""" -根据利用meta_scan得到的结果,对pdf是否为文字版进行分类。 -定义标准: -一、什么pdf会是文字pdf,只要满足以下任意一条 - 1. 随机抽取N页,如果有任何一页文字数目大于100 - 2. 只要存在一个页面,图片的数量为0 -二、什么是扫描版pdf,只要满足以下任意一条 - 1. ~~80%页面上的最大图大小一样并且面积超过页面面积0.6~~ - 2. 
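
As a usage sketch of the filter-level classify() entry point shown in magic_pdf/filter/__init__.py above: it raises for encrypted or otherwise droppable PDFs and otherwise returns a SupportedPdfParseMethod value. The file path and the surrounding error handling here are placeholders, not part of the package.

    # Illustrative usage of the classify() entry point shown above.
    from magic_pdf.config.enums import SupportedPdfParseMethod
    from magic_pdf.filter import classify

    with open('sample.pdf', 'rb') as f:    # placeholder path
        pdf_bytes = f.read()

    try:
        method = classify(pdf_bytes)
    except Exception as e:                 # encrypted / droppable PDFs raise here
        print(f'cannot classify: {e}')
    else:
        if method == SupportedPdfParseMethod.TXT:
            print('text-layer PDF: parse directly')
        else:                              # SupportedPdfParseMethod.OCR
            print('scanned PDF: route to OCR')
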
大部分页面上文字的长度都是相等的。 - -""" -import json -import sys -from collections import Counter - -import click -import numpy as np -from loguru import logger - -from magic_pdf.libs.commons import mymax, get_top_percent_list -from magic_pdf.filter.pdf_meta_scan import scan_max_page, junk_limit_min - -TEXT_LEN_THRESHOLD = 100 -AVG_TEXT_LEN_THRESHOLD = 100 -TEXT_LEN_SAMPLE_RATIO = 0.1 # 抽取0.1的页面进行文字长度统计 - - -# 一个拼接图片的方案,将某些特殊扫描版本的拆图拼成一张整图 -def merge_images(image_list, page_width, page_height, max_offset=5, max_gap=2): - # 先通过set去除所有bbox重叠的图片数据 - image_list_result = [] - for page_images in image_list: - page_result = [] - dedup = set() - for img in page_images: - x0, y0, x1, y1, img_bojid = img - if (x0, y0, x1, y1) in dedup: # 这里面会出现一些重复的bbox,无需重复出现,需要去掉 - continue - else: - dedup.add((x0, y0, x1, y1)) - page_result.append([x0, y0, x1, y1, img_bojid]) - image_list_result.append(page_result) - - # 接下来,将同一页可拼接的图片进行合并 - merged_images = [] - for page_images in image_list_result: - if not page_images: - continue - - # 先将同一页的图片从上到下,从左到右进行排序 - page_images.sort(key=lambda img: (img[1], img[0])) - - merged = [page_images[0]] - - for img in page_images[1:]: - x0, y0, x1, y1, imgid = img - - last_img = merged[-1] - last_x0, last_y0, last_x1, last_y1, last_imgid = last_img - - # 单张图片宽或者高覆盖页面宽高的9成以上是拼图的一个前置条件 - full_width = abs(x1 - x0) >= page_width * 0.9 - full_height = abs(y1 - y0) >= page_height * 0.9 - - # 如果宽达标,检测是否能竖着拼 - if full_width: - # 竖着拼需要满足两个前提,左右边界各偏移不能超过 max_offset,第一张图的下边界和第二张图的上边界偏移不能超过 max_gap - close1 = (last_x0 - max_offset) <= x0 <= (last_x0 + max_offset) and (last_x1 - max_offset) <= x1 <= ( - last_x1 + max_offset) and (last_y1 - max_gap) <= y0 <= (last_y1 + max_gap) - - # 如果高达标,检测是否可以横着拼 - if full_height: - # 横着拼需要满足两个前提,上下边界各偏移不能超过 max_offset,第一张图的右边界和第二张图的左边界偏移不能超过 max_gap - close2 = (last_y0 - max_offset) <= y0 <= (last_y0 + max_offset) and (last_y1 - max_offset) <= y1 <= ( - last_y1 + max_offset) and (last_x1 - max_gap) <= x0 <= (last_x1 + max_gap) - - # Check if the image can be merged with the last image - if (full_width and close1) or (full_height and close2): - # Merge the image with the last image - merged[-1] = [min(x0, last_x0), min(y0, last_y0), - max(x1, last_x1), max(y1, last_y1), imgid] - else: - # Add the image as a new image - merged.append(img) - - merged_images.append(merged) - - return merged_images - - -def classify_by_area(total_page: int, page_width, page_height, img_sz_list, text_len_list: list): - """ - 80%页面上的最大图大小一样并且面积超过页面面积0.6则返回False,否则返回True - :param pdf_path: - :param total_page: - :param page_width: - :param page_height: - :param img_sz_list: - :return: - """ - # # 只要有一页没有图片,那么就是文字pdf。但是同时还需要满足一个条件就是这个页面上同时不能有文字。发现过一些扫描版pdf,上面有一些空白页面,既没有图片也没有文字。 - # if any([len(img_sz) == 0 for img_sz in img_sz_list]): # 含有不含图片的页面 - # # 现在找到这些页面的index - # empty_page_index = [i for i, img_sz in enumerate(img_sz_list) if len(img_sz) == 0] - # # 然后检查这些页面上是否有文字 - # text_len_at_page_idx = [text_len for i, text_len in enumerate(text_len_list) if i in empty_page_index and text_len > 0] - # if len(text_len_at_page_idx) > TEXT_LEN_THRESHOLD: # 没有图片,但是有文字,说明可能是个文字版,如果没有文字则无法判断,留给下一步,现在要求这页文字量超过一定阈值 - # return True - - # 通过objid去掉重复出现10次以上的图片,这些图片是隐藏的透明图层,其特点是id都一样 - # 先对每个id出现的次数做个统计 - objid_cnt = Counter([objid for page_img_sz in img_sz_list for _, _, _, _, objid in page_img_sz]) - # 再去掉出现次数大于10的 - if total_page >= scan_max_page: # 新的meta_scan只扫描前 scan_max_page 页,页数大于 scan_max_page 当total_page为 scan_max_page - total_page = scan_max_page - - repeat_threshold = 2 # 把bad_image的阈值设为2 - # 
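
The stitching test inside merge_images above is easy to misread inline, so here is a minimal standalone sketch of the same idea (vertical case only). It is not the module's code; bboxes are assumed to be (x0, y0, x1, y1) tuples in points and the sample values are made up.

    # Sketch of the vertical stitching rule: two strips merge when both nearly
    # span the page width, their left/right edges line up within max_offset,
    # and the vertical gap between them is at most max_gap.
    def can_stack_vertically(a, b, page_width, max_offset=5, max_gap=2):
        ax0, ay0, ax1, ay1 = a
        bx0, by0, bx1, by1 = b
        full_width = (ax1 - ax0) >= page_width * 0.9 and (bx1 - bx0) >= page_width * 0.9
        edges_aligned = abs(bx0 - ax0) <= max_offset and abs(bx1 - ax1) <= max_offset
        small_gap = abs(by0 - ay1) <= max_gap
        return full_width and edges_aligned and small_gap

    def merge_boxes(a, b):
        return (min(a[0], b[0]), min(a[1], b[1]), max(a[2], b[2]), max(a[3], b[3]))

    page_w = 595  # example page width in pts
    top = (10, 0, 585, 400)
    bottom = (10, 401, 585, 800)
    if can_stack_vertically(top, bottom, page_w):
        print(merge_boxes(top, bottom))    # -> (10, 0, 585, 800)
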
repeat_threshold = min(2, total_page) # 当total_page为1时,repeat_threshold为1,会产生误判导致所有img变成bad_img - bad_image_objid = set([objid for objid, cnt in objid_cnt.items() if cnt >= repeat_threshold]) - # bad_image_page_idx = [i for i, page_img_sz in enumerate(img_sz_list) if any([objid in bad_image_objid for _, _, _, _, objid in page_img_sz])] - # text_len_at_bad_image_page_idx = [text_len for i, text_len in enumerate(text_len_list) if i in bad_image_page_idx and text_len > 0] - - # 特殊情况,一个文字版pdf,每页覆盖一个超大的透明图片,超大的定义是图片占整页面积的90%以上 - # fake_image_ids = [objid for objid in bad_image_objid if - # any([abs((x1 - x0) * (y1 - y0) / page_width * page_height) > 0.9 for images in img_sz_list for - # x0, y0, x1, y1, _ in images])] # 原来的代码,any里面恒为true了,原因??? - # fake_image_ids = [objid for objid in bad_image_objid for images in img_sz_list for x0, y0, x1, y1, img_id in images - # if img_id == objid and abs((x1 - x0) * (y1 - y0)) / (page_width * page_height) > 0.9] - - # if len(fake_image_ids) > 0 and any([l > TEXT_LEN_THRESHOLD for l in text_len_at_bad_image_page_idx]): # 这些透明图片所在的页面上有文字大于阈值 - # return True - - img_sz_list = [[img_sz for img_sz in page_img_sz if img_sz[-1] not in bad_image_objid] for page_img_sz in - img_sz_list] # 过滤掉重复出现的图片 - - # 有的扫描版会把一页图片拆成很多张,需要先把图拼起来再计算 - img_sz_list = merge_images(img_sz_list, page_width, page_height) - - # 计算每个页面上最大的图的面积,然后计算这个面积占页面面积的比例 - max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in - img_sz_list] - page_area = page_width * page_height - max_image_area_per_page = [area / page_area for area in max_image_area_per_page] - max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.5] - - if len(max_image_area_per_page) >= 0.5 * total_page: # 阈值从0.8改到0.5,适配3页里面有两页和两页里面有一页的情况 - # 这里条件成立的前提是把反复出现的图片去掉了。这些图片是隐藏的透明图层,其特点是id都一样 - return False - else: - return True - - -def classify_by_text_len(text_len_list: list, total_page: int): - """ - 随机抽取10%的页面,如果少于5个页面,那么就取全部页面。 - 查看页面上的文字长度,如果有任何一个页面的文字长度大于TEXT_LEN_THRESHOLD,那么就是文字pdf - :param total_page: - :param text_len_list: - :return: - """ - select_page_cnt = int(total_page * TEXT_LEN_SAMPLE_RATIO) # 选取10%的页面 - if select_page_cnt < 5: - select_page_cnt = total_page - - # # 排除头尾各10页 - # if total_page > 20: # 如果总页数大于20 - # page_range = list(range(10, total_page - 10)) # 从第11页到倒数第11页 - # else: - # page_range = list(range(total_page)) # 否则选择所有页面 - # page_num = np.random.choice(page_range, min(select_page_cnt, len(page_range)), replace=False) - # 排除前后10页对只有21,22页的pdf很尴尬,如果选出来的中间那一两页恰好没字容易误判,有了avg_words规则,这个规则可以忽略 - page_num = np.random.choice(total_page, select_page_cnt, replace=False) - text_len_lst = [text_len_list[i] for i in page_num] - is_text_pdf = any([text_len > TEXT_LEN_THRESHOLD for text_len in text_len_lst]) - return is_text_pdf - - -def classify_by_avg_words(text_len_list: list): - """ - 补充规则,如果平均每页字数少于 AVG_TEXT_LEN_THRESHOLD,就不是文字pdf - 主要是各种图集 - :param text_len_list: - :return: - """ - sum_words = sum(text_len_list) - count_of_numbers = len(text_len_list) - if count_of_numbers == 0: - is_text_pdf = False - else: - avg_words = round(sum_words / count_of_numbers) - if avg_words > AVG_TEXT_LEN_THRESHOLD: - is_text_pdf = True - else: - is_text_pdf = False - - return is_text_pdf - - -def classify_by_img_num(img_sz_list: list, img_num_list: list): - """ - 补充规则,有一种扫描版本的PDF,每一页都会放所有的扫描页进去,在 metascan 时会被去重, - 这种pdf的 metasca 扫描结果的特点是 img_sz_list 内全是空元素,img_num_list中每一页的数量都很大且相同 - :param img_sz_list: - :param img_num_list: - :return: - """ - # 
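
A compact standalone restatement of the sampling rule implemented in classify_by_text_len above, for orientation only: sample roughly 10% of pages (or every page when that would be fewer than 5) and treat the PDF as text-based if any sampled page exceeds the 100-character threshold. The text_len_per_page list below is invented for illustration.

    import numpy as np

    # Illustrative restatement of the sampling rule above, not the module code.
    def looks_like_text_pdf(text_len_per_page, sample_ratio=0.1, threshold=100):
        total = len(text_len_per_page)
        n = int(total * sample_ratio)
        if n < 5:
            n = total
        picked = np.random.choice(total, n, replace=False)
        return any(text_len_per_page[i] > threshold for i in picked)

    # 30 mostly-empty pages plus a few text-heavy ones
    lengths = [0] * 27 + [450, 800, 1200]
    print(looks_like_text_pdf(lengths))    # -> True (10% of 30 is below 5, so all pages are sampled)
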
计算img_sz_list中非空元素的个数 - count_img_sz_list_not_none = sum(1 for item in img_sz_list if item) - # 获取前80%的元素 - top_eighty_percent = get_top_percent_list(img_num_list, 0.8) - # img_sz_list中非空元素的个数小于1,前80%的元素都相等,且最大值大于等于junk_limit_min - if count_img_sz_list_not_none <= 1 and len(set(top_eighty_percent)) == 1 and max(img_num_list) >= junk_limit_min: - - #拿max和min的值,用来判断list内的值是否全都相等 - # min_imgs = min(img_num_list) - # max_imgs = max(img_num_list) - # - # if count_img_sz_list_not_none == 0 and max_imgs == min_imgs and max_imgs >= junk_limit_min: - return False # 如果满足这个条件,一定不是文字版pdf - else: - return True # 不满足这三个条件,可能是文字版pdf,通过其他规则判断 - - -def classify_by_text_layout(text_layout_per_page: list): - """ - 判断文本布局是否以竖排为主。 - - Args: - text_layout_per_page (list): 文本布局列表,列表中的每个元素表示一页的文本布局, - 值为'vertical'表示竖排,值为'horizontal'表示横排。 - - Returns: - bool: 若文本布局以竖排为主,则返回False;否则返回True。 - """ - # 统计text_layout_per_page中竖排的个数 - count_vertical = sum(1 for item in text_layout_per_page if item == 'vertical') - # 统计text_layout_per_page中横排的个数 - count_horizontal = sum(1 for item in text_layout_per_page if item == 'horizontal') - # 计算text_layout_per_page中竖排的占比 - known_layout_cnt = count_vertical + count_horizontal - if known_layout_cnt != 0: - ratio = count_vertical / known_layout_cnt - if ratio >= 0.5: # 阈值设为0.5,适配3页里面有2页和两页里有一页的情况 - return False # 文本布局以竖排为主,认为不是文字版pdf - else: - return True # 文本布局以横排为主,认为是文字版pdf - else: - return False # 文本布局未知,默认认为不是文字版pdf - - -def classify_by_img_narrow_strips(page_width, page_height, img_sz_list): - """ - 判断一页是否由细长条组成,有两个条件: - 1. 图片的宽或高达到页面宽或高的90%,且长边需要是窄边长度的数倍以上 - 2. 整个页面所有的图片有80%以上满足条件1 - - Args: - page_width (float): 页面宽度 - page_height (float): 页面高度 - img_sz_list (list): 图片尺寸列表,每个元素为一个元组,表示图片的矩形区域和尺寸,形如(x0, y0, x1, y1, size),其中(x0, y0)为矩形区域的左上角坐标,(x1, y1)为矩形区域的右下角坐标,size为图片的尺寸 - - Returns: - bool: 如果满足条件的页面的比例小于0.5,返回True,否则返回False - """ - - def is_narrow_strip(img): - x0, y0, x1, y1, _ = img - width, height = x1 - x0, y1 - y0 - return any([ - # 图片宽度大于等于页面宽度的90%,且宽度大于等于高度4倍 - width >= page_width * 0.9 and width >= height * 4, - # 图片高度大于等于页面高度的90%,且高度大于等于宽度4倍 - height >= page_height * 0.9 and height >= width * 4, - ]) - - # 初始化满足条件的页面数量 - narrow_strip_pages_count = 0 - - # 遍历所有页面 - for page_img_list in img_sz_list: - # 忽略空页面 - if not page_img_list: - continue - - # 计算页面中的图片总数 - total_images = len(page_img_list) - - # 计算页面中细长条图片的数量 - narrow_strip_images_count = 0 - for img in page_img_list: - if is_narrow_strip(img): - narrow_strip_images_count += 1 - # 如果细长条图片的数量少于5,跳过 - if narrow_strip_images_count < 5: - continue - else: - # 如果细长条图片的比例大于或等于0.8,增加满足条件的页面数量 - if narrow_strip_images_count / total_images >= 0.8: - narrow_strip_pages_count += 1 - - # 计算满足条件的页面的比例 - narrow_strip_pages_ratio = narrow_strip_pages_count / len(img_sz_list) - - return narrow_strip_pages_ratio < 0.5 - - -def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list, - # text_layout_list: list, - invalid_chars: bool): - """ - 这里的图片和页面长度单位是pts - :param total_page: - :param text_len_list: - :param page_width: - :param page_height: - :param img_sz_list: - :param pdf_path: - :return: - """ - results = { - 'by_image_area': classify_by_area(total_page, page_width, page_height, img_sz_list, text_len_list), - 'by_text_len': classify_by_text_len(text_len_list, total_page), - 'by_avg_words': classify_by_avg_words(text_len_list), - 'by_img_num': classify_by_img_num(img_sz_list, img_num_list), - # 'by_text_layout': classify_by_text_layout(text_layout_list), - 
'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list), - 'by_invalid_chars': invalid_chars, - } - - if all(results.values()): - return True, results - elif not any(results.values()): - return False, results - else: - logger.warning( - f"OCR needed based on classification result, by_image_area: {results['by_image_area']}," - f" by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}," - # f" by_text_layout: {results['by_text_layout']}," - f" by_img_narrow_strips: {results['by_img_narrow_strips']}," - f" by_invalid_chars: {results['by_invalid_chars']}", - file=sys.stderr) # 利用这种情况可以快速找出来哪些pdf比较特殊,针对性修正分类算法 - return False, results - - -@click.command() -@click.option("--json-file", type=str, help="pdf信息") -def main(json_file): - if json_file is None: - print("json_file is None", file=sys.stderr) - exit(0) - try: - with open(json_file, "r") as f: - for l in f: - if l.strip() == "": - continue - o = json.loads(l) - total_page = o["total_page"] - page_width = o["page_width_pts"] - page_height = o["page_height_pts"] - img_sz_list = o["image_info_per_page"] - text_len_list = o['text_len_per_page'] - text_layout_list = o['text_layout_per_page'] - pdf_path = o['pdf_path'] - is_encrypted = o['is_encrypted'] - is_needs_password = o['is_needs_password'] - if is_encrypted or total_page == 0 or is_needs_password: # 加密的,需要密码的,没有页面的,都不处理 - continue - tag = classify(total_page, page_width, page_height, img_sz_list, text_len_list, text_layout_list) - o['is_text_pdf'] = tag - print(json.dumps(o, ensure_ascii=False)) - except Exception as e: - print("ERROR: ", e, file=sys.stderr) - - -if __name__ == "__main__": - main() - # false = False - # true = True - # null = None - # o = 
{"pdf_path":"s3://llm-raw-snew/llm-raw-the-eye/raw/World%20Tracker%20Library/worldtracker.org/media/library/Science/Computer%20Science/Shreiner%20-%20OpenGL%20Programming%20Guide%206e%20%5BThe%20Redbook%5D%20%28AW%2C%202008%29.pdf","is_needs_password":false,"is_encrypted":false,"total_page":978,"page_width_pts":368,"page_height_pts":513,"image_info_per_page":[[[0,0,368,513,10037]],[[0,0,368,513,4]],[[0,0,368,513,7]],[[0,0,368,513,10]],[[0,0,368,513,13]],[[0,0,368,513,16]],[[0,0,368,513,19]],[[0,0,368,513,22]],[[0,0,368,513,25]],[[0,0,368,513,28]],[[0,0,368,513,31]],[[0,0,368,513,34]],[[0,0,368,513,37]],[[0,0,368,513,40]],[[0,0,368,513,43]],[[0,0,368,513,46]],[[0,0,368,513,49]],[[0,0,368,513,52]],[[0,0,368,513,55]],[[0,0,368,513,58]],[[0,0,368,513,61]],[[0,0,368,513,64]],[[0,0,368,513,67]],[[0,0,368,513,70]],[[0,0,368,513,73]],[[0,0,368,516,76]],[[0,0,368,516,79]],[[0,0,368,513,82]],[[0,0,368,513,85]],[[0,0,368,513,88]],[[0,0,368,513,91]],[[0,0,368,513,94]],[[0,0,368,513,97]],[[0,0,368,513,100]],[[0,0,368,513,103]],[[0,0,368,513,106]],[[0,0,368,513,109]],[[0,0,368,513,112]],[[0,0,368,513,115]],[[0,0,368,513,118]],[[0,0,368,513,121]],[[0,0,368,513,124]],[[0,0,368,513,127]],[[0,0,368,513,130]],[[0,0,368,513,133]],[[0,0,368,513,136]],[[0,0,368,513,139]],[[0,0,368,513,142]],[[0,0,368,513,145]],[[0,0,368,513,148]],[[0,0,368,513,151]],[[0,0,368,513,154]],[[0,0,368,513,157]],[[0,0,368,513,160]],[[0,0,368,513,163]],[[0,0,368,513,166]],[[0,0,368,513,169]],[[0,0,368,513,172]],[[0,0,368,513,175]],[[0,0,368,513,178]],[[0,0,368,513,181]],[[0,0,368,513,184]],[[0,0,368,513,187]],[[0,0,368,513,190]],[[0,0,368,513,193]],[[0,0,368,513,196]],[[0,0,368,513,199]],[[0,0,368,513,202]],[[0,0,368,513,205]],[[0,0,368,513,208]],[[0,0,368,513,211]],[[0,0,368,513,214]],[[0,0,368,513,217]],[[0,0,368,513,220]],[[0,0,368,513,223]],[[0,0,368,513,226]],[[0,0,368,513,229]],[[0,0,368,513,232]],[[0,0,368,513,235]],[[0,0,368,513,238]],[[0,0,368,513,241]],[[0,0,368,513,244]],[[0,0,368,513,247]],[[0,0,368,513,250]],[[0,0,368,513,253]],[[0,0,368,513,256]],[[0,0,368,513,259]],[[0,0,368,513,262]],[[0,0,368,513,265]],[[0,0,368,513,268]],[[0,0,368,513,271]],[[0,0,368,513,274]],[[0,0,368,513,277]],[[0,0,368,513,280]],[[0,0,368,513,283]],[[0,0,368,513,286]],[[0,0,368,513,289]],[[0,0,368,513,292]],[[0,0,368,513,295]],[[0,0,368,513,298]],[[0,0,368,513,301]],[[0,0,368,513,304]],[[0,0,368,513,307]],[[0,0,368,513,310]],[[0,0,368,513,313]],[[0,0,368,513,316]],[[0,0,368,513,319]],[[0,0,368,513,322]],[[0,0,368,513,325]],[[0,0,368,513,328]],[[0,0,368,513,331]],[[0,0,368,513,334]],[[0,0,368,513,337]],[[0,0,368,513,340]],[[0,0,368,513,343]],[[0,0,368,513,346]],[[0,0,368,513,349]],[[0,0,368,513,352]],[[0,0,368,513,355]],[[0,0,368,513,358]],[[0,0,368,513,361]],[[0,0,368,513,364]],[[0,0,368,513,367]],[[0,0,368,513,370]],[[0,0,368,513,373]],[[0,0,368,513,376]],[[0,0,368,513,379]],[[0,0,368,513,382]],[[0,0,368,513,385]],[[0,0,368,513,388]],[[0,0,368,513,391]],[[0,0,368,513,394]],[[0,0,368,513,397]],[[0,0,368,513,400]],[[0,0,368,513,403]],[[0,0,368,513,406]],[[0,0,368,513,409]],[[0,0,368,513,412]],[[0,0,368,513,415]],[[0,0,368,513,418]],[[0,0,368,513,421]],[[0,0,368,513,424]],[[0,0,368,513,427]],[[0,0,368,513,430]],[[0,0,368,513,433]],[[0,0,368,513,436]],[[0,0,368,513,439]],[[0,0,368,513,442]],[[0,0,368,513,445]],[[0,0,368,513,448]],[[0,0,368,513,451]],[[0,0,368,513,454]],[[0,0,368,513,457]],[[0,0,368,513,460]],[[0,0,368,513,463]],[[0,0,368,513,466]],[[0,0,368,513,469]],[[0,0,368,513,472]],[[0,0,368,513,475]],[[0,0,368,513,478]],[[0,0,368,513,481]],[[0,0
,368,513,484]],[[0,0,368,513,487]],[[0,0,368,513,490]],[[0,0,368,513,493]],[[0,0,368,513,496]],[[0,0,368,513,499]],[[0,0,368,513,502]],[[0,0,368,513,505]],[[0,0,368,513,508]],[[0,0,368,513,511]],[[0,0,368,513,514]],[[0,0,368,513,517]],[[0,0,368,513,520]],[[0,0,368,513,523]],[[0,0,368,513,526]],[[0,0,368,513,529]],[[0,0,368,513,532]],[[0,0,368,513,535]],[[0,0,368,513,538]],[[0,0,368,513,541]],[[0,0,368,513,544]],[[0,0,368,513,547]],[[0,0,368,513,550]],[[0,0,368,513,553]],[[0,0,368,513,556]],[[0,0,368,513,559]],[[0,0,368,513,562]],[[0,0,368,513,565]],[[0,0,368,513,568]],[[0,0,368,513,571]],[[0,0,368,513,574]],[[0,0,368,513,577]],[[0,0,368,513,580]],[[0,0,368,513,583]],[[0,0,368,513,586]],[[0,0,368,513,589]],[[0,0,368,513,592]],[[0,0,368,513,595]],[[0,0,368,513,598]],[[0,0,368,513,601]],[[0,0,368,513,604]],[[0,0,368,513,607]],[[0,0,368,513,610]],[[0,0,368,513,613]],[[0,0,368,513,616]],[[0,0,368,513,619]],[[0,0,368,513,622]],[[0,0,368,513,625]],[[0,0,368,513,628]],[[0,0,368,513,631]],[[0,0,368,513,634]],[[0,0,368,513,637]],[[0,0,368,513,640]],[[0,0,368,513,643]],[[0,0,368,513,646]],[[0,0,368,513,649]],[[0,0,368,513,652]],[[0,0,368,513,655]],[[0,0,368,513,658]],[[0,0,368,513,661]],[[0,0,368,513,664]],[[0,0,368,513,667]],[[0,0,368,513,670]],[[0,0,368,513,673]],[[0,0,368,513,676]],[[0,0,368,513,679]],[[0,0,368,513,682]],[[0,0,368,513,685]],[[0,0,368,513,688]],[[0,0,368,513,691]],[[0,0,368,513,694]],[[0,0,368,513,697]],[[0,0,368,513,700]],[[0,0,368,513,703]],[[0,0,368,513,706]],[[0,0,368,513,709]],[[0,0,368,513,712]],[[0,0,368,513,715]],[[0,0,368,513,718]],[[0,0,368,513,721]],[[0,0,368,513,724]],[[0,0,368,513,727]],[[0,0,368,513,730]],[[0,0,368,513,733]],[[0,0,368,513,736]],[[0,0,368,513,739]],[[0,0,368,513,742]],[[0,0,368,513,745]],[[0,0,368,513,748]],[[0,0,368,513,751]],[[0,0,368,513,754]],[[0,0,368,513,757]],[[0,0,368,513,760]],[[0,0,368,513,763]],[[0,0,368,513,766]],[[0,0,368,513,769]],[[0,0,368,513,772]],[[0,0,368,513,775]],[[0,0,368,513,778]],[[0,0,368,513,781]],[[0,0,368,513,784]],[[0,0,368,513,787]],[[0,0,368,513,790]],[[0,0,368,513,793]],[[0,0,368,513,796]],[[0,0,368,513,799]],[[0,0,368,513,802]],[[0,0,368,513,805]],[[0,0,368,513,808]],[[0,0,368,513,811]],[[0,0,368,513,814]],[[0,0,368,513,817]],[[0,0,368,513,820]],[[0,0,368,513,823]],[[0,0,368,513,826]],[[0,0,368,513,829]],[[0,0,368,513,832]],[[0,0,368,513,835]],[[0,0,368,513,838]],[[0,0,368,513,841]],[[0,0,368,513,844]],[[0,0,368,513,847]],[[0,0,368,513,850]],[[0,0,368,513,853]],[[0,0,368,513,856]],[[0,0,368,513,859]],[[0,0,368,513,862]],[[0,0,368,513,865]],[[0,0,368,513,868]],[[0,0,368,513,871]],[[0,0,368,513,874]],[[0,0,368,513,877]],[[0,0,368,513,880]],[[0,0,368,513,883]],[[0,0,368,513,886]],[[0,0,368,513,889]],[[0,0,368,513,892]],[[0,0,368,513,895]],[[0,0,368,513,898]],[[0,0,368,513,901]],[[0,0,368,513,904]],[[0,0,368,513,907]],[[0,0,368,513,910]],[[0,0,368,513,913]],[[0,0,368,513,916]],[[0,0,368,513,919]],[[0,0,368,513,922]],[[0,0,368,513,925]],[[0,0,368,513,928]],[[0,0,368,513,931]],[[0,0,368,513,934]],[[0,0,368,513,937]],[[0,0,368,513,940]],[[0,0,368,513,943]],[[0,0,368,513,946]],[[0,0,368,513,949]],[[0,0,368,513,952]],[[0,0,368,513,955]],[[0,0,368,513,958]],[[0,0,368,513,961]],[[0,0,368,513,964]],[[0,0,368,513,967]],[[0,0,368,513,970]],[[0,0,368,513,973]],[[0,0,368,513,976]],[[0,0,368,513,979]],[[0,0,368,513,982]],[[0,0,368,513,985]],[[0,0,368,513,988]],[[0,0,368,513,991]],[[0,0,368,513,994]],[[0,0,368,513,997]],[[0,0,368,513,1000]],[[0,0,368,513,1003]],[[0,0,368,513,1006]],[[0,0,368,513,1009]],[[0,0,368,513,1012]],[[0,0,368,513,1
015]],[[0,0,368,513,1018]],[[0,0,368,513,2797]],[[0,0,368,513,2798]],[[0,0,368,513,2799]],[[0,0,368,513,2800]],[[0,0,368,513,2801]],[[0,0,368,513,2802]],[[0,0,368,513,2803]],[[0,0,368,513,2804]],[[0,0,368,513,2805]],[[0,0,368,513,2806]],[[0,0,368,513,2807]],[[0,0,368,513,2808]],[[0,0,368,513,2809]],[[0,0,368,513,2810]],[[0,0,368,513,2811]],[[0,0,368,513,2812]],[[0,0,368,513,2813]],[[0,0,368,513,2814]],[[0,0,368,513,2815]],[[0,0,368,513,2816]],[[0,0,368,513,2817]],[[0,0,368,513,2818]],[[0,0,368,513,2819]],[[0,0,368,513,2820]],[[0,0,368,513,2821]],[[0,0,368,513,2822]],[[0,0,368,513,2823]],[[0,0,368,513,2824]],[[0,0,368,513,2825]],[[0,0,368,513,2826]],[[0,0,368,513,2827]],[[0,0,368,513,2828]],[[0,0,368,513,2829]],[[0,0,368,513,2830]],[[0,0,368,513,2831]],[[0,0,368,513,2832]],[[0,0,368,513,2833]],[[0,0,368,513,2834]],[[0,0,368,513,2835]],[[0,0,368,513,2836]],[[0,0,368,513,2837]],[[0,0,368,513,2838]],[[0,0,368,513,2839]],[[0,0,368,513,2840]],[[0,0,368,513,2841]],[[0,0,368,513,2842]],[[0,0,368,513,2843]],[[0,0,368,513,2844]],[[0,0,368,513,2845]],[[0,0,368,513,2846]],[[0,0,368,513,2847]],[[0,0,368,513,2848]],[[0,0,368,513,2849]],[[0,0,368,513,2850]],[[0,0,368,513,2851]],[[0,0,368,513,2852]],[[0,0,368,513,2853]],[[0,0,368,513,2854]],[[0,0,368,513,2855]],[[0,0,368,513,2856]],[[0,0,368,513,2857]],[[0,0,368,513,2858]],[[0,0,368,513,2859]],[[0,0,368,513,2860]],[[0,0,368,513,2861]],[[0,0,368,513,2862]],[[0,0,368,513,2863]],[[0,0,368,513,2864]],[[0,0,368,513,2797]],[[0,0,368,513,2798]],[[0,0,368,513,2799]],[[0,0,368,513,2800]],[[0,0,368,513,2801]],[[0,0,368,513,2802]],[[0,0,368,513,2803]],[[0,0,368,513,2804]],[[0,0,368,513,2805]],[[0,0,368,513,2806]],[[0,0,368,513,2807]],[[0,0,368,513,2808]],[[0,0,368,513,2809]],[[0,0,368,513,2810]],[[0,0,368,513,2811]],[[0,0,368,513,2812]],[[0,0,368,513,2813]],[[0,0,368,513,2814]],[[0,0,368,513,2815]],[[0,0,368,513,2816]],[[0,0,368,513,2817]],[[0,0,368,513,2818]],[[0,0,368,513,2819]],[[0,0,368,513,2820]],[[0,0,368,513,2821]],[[0,0,368,513,2822]],[[0,0,368,513,2823]],[[0,0,368,513,2824]],[[0,0,368,513,2825]],[[0,0,368,513,2826]],[[0,0,368,513,2827]],[[0,0,368,513,2828]],[[0,0,368,513,2829]],[[0,0,368,513,2830]],[[0,0,368,513,2831]],[[0,0,368,513,2832]],[[0,0,368,513,2833]],[[0,0,368,513,2834]],[[0,0,368,513,2835]],[[0,0,368,513,2836]],[[0,0,368,513,2837]],[[0,0,368,513,2838]],[[0,0,368,513,2839]],[[0,0,368,513,2840]],[[0,0,368,513,2841]],[[0,0,368,513,2842]],[[0,0,368,513,2843]],[[0,0,368,513,2844]],[[0,0,368,513,2845]],[[0,0,368,513,2846]],[[0,0,368,513,2847]],[[0,0,368,513,2848]],[[0,0,368,513,2849]],[[0,0,368,513,2850]],[[0,0,368,513,2851]],[[0,0,368,513,2852]],[[0,0,368,513,2853]],[[0,0,368,513,2854]],[[0,0,368,513,2855]],[[0,0,368,513,2856]],[[0,0,368,513,2857]],[[0,0,368,513,2858]],[[0,0,368,513,2859]],[[0,0,368,513,2860]],[[0,0,368,513,2861]],[[0,0,368,513,2862]],[[0,0,368,513,2863]],[[0,0,368,513,2864]],[[0,0,368,513,1293]],[[0,0,368,513,1296]],[[0,0,368,513,1299]],[[0,0,368,513,1302]],[[0,0,368,513,1305]],[[0,0,368,513,1308]],[[0,0,368,513,1311]],[[0,0,368,513,1314]],[[0,0,368,513,1317]],[[0,0,368,513,1320]],[[0,0,368,513,1323]],[[0,0,368,513,1326]],[[0,0,368,513,1329]],[[0,0,368,513,1332]],[[0,0,368,513,1335]],[[0,0,368,513,1338]],[[0,0,368,513,1341]],[[0,0,368,513,1344]],[[0,0,368,513,1347]],[[0,0,368,513,1350]],[[0,0,368,513,1353]],[[0,0,368,513,1356]],[[0,0,368,513,1359]],[[0,0,368,513,1362]],[[0,0,368,513,1365]],[[0,0,368,513,1368]],[[0,0,368,513,1371]],[[0,0,368,513,1374]],[[0,0,368,513,1377]],[[0,0,368,513,1380]],[[0,0,368,513,1383]],[[0,0,368,513,1386]],
[[0,0,368,513,1389]],[[0,0,368,513,1392]],[[0,0,368,513,1395]],[[0,0,368,513,1398]],[[0,0,368,513,1401]],[[0,0,368,513,1404]],[[0,0,368,513,1407]],[[0,0,368,513,1410]],[[0,0,368,513,1413]],[[0,0,368,513,1416]],[[0,0,368,513,1419]],[[0,0,368,513,1422]],[[0,0,368,513,1425]],[[0,0,368,513,1428]],[[0,0,368,513,1431]],[[0,0,368,513,1434]],[[0,0,368,513,1437]],[[0,0,368,513,1440]],[[0,0,368,513,1443]],[[0,0,368,513,1446]],[[0,0,368,513,1449]],[[0,0,368,513,1452]],[[0,0,368,513,1455]],[[0,0,368,513,1458]],[[0,0,368,513,1461]],[[0,0,368,513,1464]],[[0,0,368,513,1467]],[[0,0,368,513,1470]],[[0,0,368,513,1473]],[[0,0,368,513,1476]],[[0,0,368,513,1479]],[[0,0,368,513,1482]],[[0,0,368,513,1485]],[[0,0,368,513,1488]],[[0,0,368,513,1491]],[[0,0,368,513,1494]],[[0,0,368,513,1497]],[[0,0,368,513,1500]],[[0,0,368,513,1503]],[[0,0,368,513,1506]],[[0,0,368,513,1509]],[[0,0,368,513,1512]],[[0,0,368,513,1515]],[[0,0,368,513,1518]],[[0,0,368,513,1521]],[[0,0,368,513,1524]],[[0,0,368,513,1527]],[[0,0,368,513,1530]],[[0,0,368,513,1533]],[[0,0,368,513,1536]],[[0,0,368,513,1539]],[[0,0,368,513,1542]],[[0,0,368,513,1545]],[[0,0,368,513,1548]],[[0,0,368,513,1551]],[[0,0,368,513,1554]],[[0,0,368,513,1557]],[[0,0,368,513,1560]],[[0,0,368,513,1563]],[[0,0,368,513,1566]],[[0,0,368,513,1569]],[[0,0,368,513,1572]],[[0,0,368,513,1575]],[[0,0,368,513,1578]],[[0,0,368,513,1581]],[[0,0,368,513,1584]],[[0,0,368,513,1587]],[[0,0,368,513,1590]],[[0,0,368,513,1593]],[[0,0,368,513,1596]],[[0,0,368,513,1599]],[[0,0,368,513,1602]],[[0,0,368,513,1605]],[[0,0,368,513,1608]],[[0,0,368,513,1611]],[[0,0,368,513,1614]],[[0,0,368,513,1617]],[[0,0,368,513,1620]],[[0,0,368,513,1623]],[[0,0,368,513,1626]],[[0,0,368,513,1629]],[[0,0,368,513,1632]],[[0,0,368,513,1635]],[[0,0,368,513,1638]],[[0,0,368,513,1641]],[[0,0,368,513,1644]],[[0,0,368,513,1647]],[[0,0,368,513,1650]],[[0,0,368,513,1653]],[[0,0,368,513,1656]],[[0,0,368,513,1659]],[[0,0,368,513,1662]],[[0,0,368,513,1665]],[[0,0,368,513,1668]],[[0,0,368,513,1671]],[[0,0,368,513,1674]],[[0,0,368,513,1677]],[[0,0,368,513,1680]],[[0,0,368,513,1683]],[[0,0,368,513,1686]],[[0,0,368,513,1689]],[[0,0,368,513,1692]],[[0,0,368,513,1695]],[[0,0,368,513,1698]],[[0,0,368,513,1701]],[[0,0,368,513,1704]],[[0,0,368,513,1707]],[[0,0,368,513,1710]],[[0,0,368,513,1713]],[[0,0,368,513,1716]],[[0,0,368,513,1719]],[[0,0,368,513,1722]],[[0,0,368,513,1725]],[[0,0,368,513,1728]],[[0,0,368,513,1731]],[[0,0,368,513,1734]],[[0,0,368,513,1737]],[[0,0,368,513,1740]],[[0,0,368,513,1743]],[[0,0,368,513,1746]],[[0,0,368,513,1749]],[[0,0,368,513,1752]],[[0,0,368,513,1755]],[[0,0,368,513,1758]],[[0,0,368,513,1761]],[[0,0,368,513,1764]],[[0,0,368,513,1767]],[[0,0,368,513,1770]],[[0,0,368,513,1773]],[[0,0,368,513,1776]],[[0,0,368,513,1779]],[[0,0,368,513,1782]],[[0,0,368,513,1785]],[[0,0,368,513,1788]],[[0,0,368,513,1791]],[[0,0,368,513,1794]],[[0,0,368,513,1797]],[[0,0,368,513,1800]],[[0,0,368,513,1803]],[[0,0,368,513,1806]],[[0,0,368,513,1809]],[[0,0,368,513,1812]],[[0,0,368,513,1815]],[[0,0,368,513,1818]],[[0,0,368,513,1821]],[[0,0,368,513,1824]],[[0,0,368,513,1827]],[[0,0,368,513,1830]],[[0,0,368,513,1833]],[[0,0,368,513,1836]],[[0,0,368,513,1839]],[[0,0,368,513,1842]],[[0,0,368,513,1845]],[[0,0,368,513,1848]],[[0,0,368,513,1851]],[[0,0,368,513,1854]],[[0,0,368,513,1857]],[[0,0,368,513,1860]],[[0,0,368,513,1863]],[[0,0,368,513,1866]],[[0,0,368,513,1869]],[[0,0,368,513,1872]],[[0,0,368,513,1875]],[[0,0,368,513,1878]],[[0,0,368,513,1881]],[[0,0,368,513,1884]],[[0,0,368,513,1887]],[[0,0,368,513,1890]],[[0,0,368,513,1893]],[[0,0,
368,513,1896]],[[0,0,368,513,1899]],[[0,0,368,513,1902]],[[0,0,368,513,1905]],[[0,0,368,513,1908]],[[0,0,368,513,1911]],[[0,0,368,513,1914]],[[0,0,368,513,1917]],[[0,0,368,513,1920]],[[0,0,368,513,1923]],[[0,0,368,513,1926]],[[0,0,368,513,1929]],[[0,0,368,513,1932]],[[0,0,368,513,1935]],[[0,0,368,513,1938]],[[0,0,368,513,1941]],[[0,0,368,513,1944]],[[0,0,368,513,1947]],[[0,0,368,513,1950]],[[0,0,368,513,1953]],[[0,0,368,513,1956]],[[0,0,368,513,1959]],[[0,0,368,513,1962]],[[0,0,368,513,1965]],[[0,0,368,513,1968]],[[0,0,368,513,1971]],[[0,0,368,513,1974]],[[0,0,368,513,1977]],[[0,0,368,513,1980]],[[0,0,368,513,1983]],[[0,0,368,513,1986]],[[0,0,368,513,1989]],[[0,0,368,513,1992]],[[0,0,368,513,1995]],[[0,0,368,513,1998]],[[0,0,368,513,2001]],[[0,0,368,513,2004]],[[0,0,368,513,2007]],[[0,0,368,513,2010]],[[0,0,368,513,2013]],[[0,0,368,513,2016]],[[0,0,368,513,2019]],[[0,0,368,513,2022]],[[0,0,368,513,2025]],[[0,0,368,513,2028]],[[0,0,368,513,2031]],[[0,0,368,513,2034]],[[0,0,368,513,2037]],[[0,0,368,513,2040]],[[0,0,368,513,2043]],[[0,0,368,513,2046]],[[0,0,368,513,2049]],[[0,0,368,513,2052]],[[0,0,368,513,2055]],[[0,0,368,513,2058]],[[0,0,368,513,2061]],[[0,0,368,513,2064]],[[0,0,368,513,2067]],[[0,0,368,513,2070]],[[0,0,368,513,2073]],[[0,0,368,513,2076]],[[0,0,368,513,2079]],[[0,0,368,513,2082]],[[0,0,368,513,2085]],[[0,0,368,513,2088]],[[0,0,368,513,2091]],[[0,0,368,513,2094]],[[0,0,368,513,2097]],[[0,0,368,513,2100]],[[0,0,368,513,2103]],[[0,0,368,513,2106]],[[0,0,368,513,2109]],[[0,0,368,513,2112]],[[0,0,368,513,2115]],[[0,0,368,513,2118]],[[0,0,368,513,2121]],[[0,0,368,513,2124]],[[0,0,368,513,2127]],[[0,0,368,513,2130]],[[0,0,368,513,2133]],[[0,0,368,513,2136]],[[0,0,368,513,2139]],[[0,0,368,513,2142]],[[0,0,368,513,2145]],[[0,0,368,513,2148]],[[0,0,368,513,2151]],[[0,0,368,513,2154]],[[0,0,368,513,2157]],[[0,0,368,513,2160]],[[0,0,368,513,2163]],[[0,0,368,513,2166]],[[0,0,368,513,2169]],[[0,0,368,513,2172]],[[0,0,368,513,2175]],[[0,0,368,513,2178]],[[0,0,368,513,2181]],[[0,0,368,513,2184]],[[0,0,368,513,2187]],[[0,0,368,513,2190]],[[0,0,368,513,2193]],[[0,0,368,513,2196]],[[0,0,368,513,2199]],[[0,0,368,513,2202]],[[0,0,368,513,2205]],[[0,0,368,513,2208]],[[0,0,368,513,2211]],[[0,0,368,513,2214]],[[0,0,368,513,2217]],[[0,0,368,513,2220]],[[0,0,368,513,2223]],[[0,0,368,513,2226]],[[0,0,368,513,2229]],[[0,0,368,513,2232]],[[0,0,368,513,2235]],[[0,0,368,513,2238]],[[0,0,368,513,2241]],[[0,0,368,513,2244]],[[0,0,368,513,2247]],[[0,0,368,513,2250]],[[0,0,368,513,2253]],[[0,0,368,513,2256]],[[0,0,368,513,2259]],[[0,0,368,513,2262]],[[0,0,368,513,2265]],[[0,0,368,513,2268]],[[0,0,368,513,2271]],[[0,0,368,513,2274]],[[0,0,368,513,2277]],[[0,0,368,513,2280]],[[0,0,368,513,2283]],[[0,0,368,513,2286]],[[0,0,368,513,2289]],[[0,0,368,513,2292]],[[0,0,368,513,2295]],[[0,0,368,513,2298]],[[0,0,368,513,2301]],[[0,0,368,513,2304]],[[0,0,368,513,2307]],[[0,0,368,513,2310]],[[0,0,368,513,2313]],[[0,0,368,513,2316]],[[0,0,368,513,2319]],[[0,0,368,513,2322]],[[0,0,368,513,2325]],[[0,0,368,513,2328]],[[0,0,368,513,2331]],[[0,0,368,513,2334]],[[0,0,368,513,2337]],[[0,0,368,513,2340]],[[0,0,368,513,2343]],[[0,0,368,513,2346]],[[0,0,368,513,2349]],[[0,0,368,513,2352]],[[0,0,368,513,2355]],[[0,0,368,513,2358]],[[0,0,368,513,2361]],[[0,0,368,513,2364]],[[0,0,368,513,2367]],[[0,0,368,513,2370]],[[0,0,368,513,2373]],[[0,0,368,513,2376]],[[0,0,368,513,2379]],[[0,0,368,513,2382]],[[0,0,368,513,2385]],[[0,0,368,513,2388]],[[0,0,368,513,2391]],[[0,0,368,513,2394]],[[0,0,368,513,2397]],[[0,0,368,513,2400]],[[0,0,368,51
3,2403]],[[0,0,368,513,2406]],[[0,0,368,513,2409]],[[0,0,368,513,2412]],[[0,0,368,513,2415]],[[0,0,368,513,2418]],[[0,0,368,513,2421]],[[0,0,368,513,2424]],[[0,0,368,513,2427]],[[0,0,368,513,2430]],[[0,0,368,513,2433]],[[0,0,368,513,2436]],[[0,0,368,513,2439]],[[0,0,368,513,2442]],[[0,0,368,513,2445]],[[0,0,368,513,2448]],[[0,0,368,513,2451]],[[0,0,368,513,2454]],[[0,0,368,513,2457]],[[0,0,368,513,2460]],[[0,0,368,513,2463]],[[0,0,368,513,2466]],[[0,0,368,513,2469]],[[0,0,368,513,2472]],[[0,0,368,513,2475]],[[0,0,368,513,2478]],[[0,0,368,513,2481]],[[0,0,368,513,2484]],[[0,0,368,513,2487]],[[0,0,368,513,2490]],[[0,0,368,513,2493]],[[0,0,368,513,2496]],[[0,0,368,513,2499]],[[0,0,368,513,2502]],[[0,0,368,513,2505]],[[0,0,368,513,2508]],[[0,0,368,513,2511]],[[0,0,368,513,2514]],[[0,0,368,513,2517]],[[0,0,368,513,2520]],[[0,0,368,513,2523]],[[0,0,368,513,2526]],[[0,0,368,513,2529]],[[0,0,368,513,2532]],[[0,0,368,513,2535]],[[0,0,368,513,2538]],[[0,0,368,513,2541]],[[0,0,368,513,2544]],[[0,0,368,513,2547]],[[0,0,368,513,2550]],[[0,0,368,513,2553]],[[0,0,368,513,2556]],[[0,0,368,513,2559]],[[0,0,368,513,2562]],[[0,0,368,513,2565]],[[0,0,368,513,2568]],[[0,0,368,513,2571]],[[0,0,368,513,2574]],[[0,0,368,513,2577]],[[0,0,368,513,2580]],[[0,0,368,513,2583]],[[0,0,368,513,2586]],[[0,0,368,513,2589]],[[0,0,368,513,2592]],[[0,0,368,513,2595]],[[0,0,368,513,2598]],[[0,0,368,513,2601]],[[0,0,368,513,2604]],[[0,0,368,513,2607]],[[0,0,368,513,2610]],[[0,0,368,513,2613]],[[0,0,368,513,2616]],[[0,0,368,513,2619]],[[0,0,368,513,2622]],[[0,0,368,513,2625]],[[0,0,368,513,2628]],[[0,0,368,513,2631]],[[0,0,368,513,2634]],[[0,0,368,513,2637]],[[0,0,368,513,2640]],[[0,0,368,513,2643]],[[0,0,368,513,2646]],[[0,0,368,513,2649]],[[0,0,368,513,2652]],[[0,0,368,513,2655]],[[0,0,368,513,2658]],[[0,0,368,513,2661]],[[0,0,368,513,2664]],[[0,0,368,513,2667]],[[0,0,368,513,2670]],[[0,0,368,513,2673]],[[0,0,368,513,2676]],[[0,0,368,513,2679]],[[0,0,368,513,2682]],[[0,0,368,513,2685]],[[0,0,368,513,2688]],[[0,0,368,513,2691]],[[0,0,368,513,2694]],[[0,0,368,513,2697]],[[0,0,368,513,2700]],[[0,0,368,513,2703]],[[0,0,368,513,2706]],[[0,0,368,513,2709]],[[0,0,368,513,2712]],[[0,0,368,513,2715]],[[0,0,368,513,2718]],[[0,0,368,513,2721]],[[0,0,368,513,2724]],[[0,0,368,513,2727]],[[0,0,368,513,2730]],[[0,0,368,513,2733]],[[0,0,368,513,2736]],[[0,0,368,513,2739]],[[0,0,368,513,2742]],[[0,0,368,513,2745]],[[0,0,368,513,2748]],[[0,0,368,513,2751]],[[0,0,368,513,2754]],[[0,0,368,513,2757]],[[0,0,368,513,2760]],[[0,0,368,513,2763]],[[0,0,368,513,2766]],[[0,0,368,513,2769]],[[0,0,368,513,2772]],[[0,0,368,513,2775]],[[0,0,368,513,2778]],[[0,0,368,513,2781]],[[0,0,368,513,2784]],[[0,0,368,513,2787]],[[0,0,368,513,2790]],[[0,0,368,513,2793]],[[0,0,368,513,2796]]],"text_len_per_page":[53,53,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54
,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54],"metadata":{"format":"PDF 1.6","title":"","author":"","subject":"","keywords":"","creator":"Adobe Acrobat 7.0","producer":"Adobe Acrobat 7.0 Image Conversion Plug-in","creationDate":"D:20080404141457+01'00'","modDate":"D:20080404144821+01'00'","trapped":"","encryption":null}} - # o = json.loads(json.dumps(o)) - # total_page = o["total_page"] - # page_width = o["page_width_pts"] - # page_height = o["page_height_pts"] - # img_sz_list = o["image_info_per_page"] - # text_len_list = o['text_len_per_page'] - # pdf_path = o['pdf_path'] - # is_encrypted = o['is_encrypted'] - # is_needs_password = o['is_needs_password'] - # if is_encrypted or total_page == 0 or is_needs_password: # 加密的,需要密码的,没有页面的,都不处理 - # print("加密的") - # exit(0) - # tag = classify(pdf_path, total_page, page_width, page_height, img_sz_list, text_len_list) - # o['is_text_pdf'] = tag - # print(json.dumps(o, ensure_ascii=False)) diff --git a/magic_pdf/filter/pdf_meta_scan.py b/magic_pdf/filter/pdf_meta_scan.py deleted file mode 100644 index 67e56315057299b4888bc4058f057d857c0b3dc8..0000000000000000000000000000000000000000 --- a/magic_pdf/filter/pdf_meta_scan.py +++ /dev/null @@ -1,397 +0,0 @@ -"""输入: s3路径,每行一个 输出: pdf文件元信息,包括每一页上的所有图片的长宽高,bbox位置.""" - -from collections import Counter - -import fitz -from loguru import logger - -from magic_pdf.config.drop_reason import DropReason -from magic_pdf.libs.commons import 
get_top_percent_list, mymax -from magic_pdf.libs.language import detect_lang -from magic_pdf.libs.pdf_check import detect_invalid_chars_by_pymupdf, detect_invalid_chars - -scan_max_page = 50 -junk_limit_min = 10 - - -def calculate_max_image_area_per_page(result: list, page_width_pts, page_height_pts): - max_image_area_per_page = [ - mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) - for page_img_sz in result - ] - page_area = int(page_width_pts) * int(page_height_pts) - max_image_area_per_page = [area / page_area for area in max_image_area_per_page] - max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.6] - return max_image_area_per_page - - -def process_image(page, junk_img_bojids=[]): - page_result = [] # 存每个页面里的多张图四元组信息 - items = page.get_images() - dedup = set() - for img in items: - # 这里返回的是图片在page上的实际展示的大小。返回一个数组,每个元素第一部分是 - img_bojid = img[ - 0 - ] # 在pdf文件中是全局唯一的,如果这个图反复出现在pdf里那么就可能是垃圾信息,例如水印、页眉页脚等 - if img_bojid in junk_img_bojids: # 如果是垃圾图像,就跳过 - continue - recs = page.get_image_rects(img, transform=True) - if recs: - rec = recs[0][0] - x0, y0, x1, y1 = map(int, rec) - width = x1 - x0 - height = y1 - y0 - if ( - x0, - y0, - x1, - y1, - img_bojid, - ) in dedup: # 这里面会出现一些重复的bbox,无需重复出现,需要去掉 - continue - if not all( - [width, height] - ): # 长和宽任何一个都不能是0,否则这个图片不可见,没有实际意义 - continue - dedup.add((x0, y0, x1, y1, img_bojid)) - page_result.append([x0, y0, x1, y1, img_bojid]) - return page_result - - -def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list: - """返回每个页面里的图片的四元组,每个页面多个图片。 - - :param doc: - :return: - """ - # 使用 Counter 计数 img_bojid 的出现次数 - img_bojid_counter = Counter(img[0] for page in doc for img in page.get_images()) - # 找出出现次数超过 len(doc) 半数的 img_bojid - - junk_limit = max(len(doc) * 0.5, junk_limit_min) # 对一些页数比较少的进行豁免 - - junk_img_bojids = [ - img_bojid - for img_bojid, count in img_bojid_counter.items() - if count >= junk_limit - ] - - # todo 加个判断,用前十页就行,这些垃圾图片需要满足两个条件,不止出现的次数要足够多,而且图片占书页面积的比例要足够大,且图与图大小都差不多 - # 有两种扫描版,一种文字版,这里可能会有误判 - # 扫描版1:每页都有所有扫描页图片,特点是图占比大,每页展示1张 - # 扫描版2,每页存储的扫描页图片数量递增,特点是图占比大,每页展示1张,需要清空junklist跑前50页图片信息用于分类判断 - # 文 字版1.每页存储所有图片,特点是图片占页面比例不大,每页展示可能为0也可能不止1张 这种pdf需要拿前10页抽样检测img大小和个数,如果符合需要清空junklist - imgs_len_list = [len(page.get_images()) for page in doc] - - special_limit_pages = 10 - - # 统一用前十页结果做判断 - result = [] - break_loop = False - for i, page in enumerate(doc): - if break_loop: - break - if i >= special_limit_pages: - break - page_result = process_image( - page - ) # 这里不传junk_img_bojids,拿前十页所有图片信息用于后续分析 - result.append(page_result) - for item in result: - if not any( - item - ): # 如果任何一页没有图片,说明是个文字版,需要判断是否为特殊文字版 - if ( - max(imgs_len_list) == min(imgs_len_list) - and max(imgs_len_list) >= junk_limit_min - ): # 如果是特殊文字版,就把junklist置空并break - junk_img_bojids = [] - else: # 不是特殊文字版,是个普通文字版,但是存在垃圾图片,不置空junklist - pass - break_loop = True - break - if not break_loop: - # 获取前80%的元素 - top_eighty_percent = get_top_percent_list(imgs_len_list, 0.8) - # 检查前80%的元素是否都相等 - if len(set(top_eighty_percent)) == 1 and max(imgs_len_list) >= junk_limit_min: - # # 如果前10页跑完都有图,根据每页图片数量是否相等判断是否需要清除junklist - # if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min: - - # 前10页都有图,且每页数量一致,需要检测图片大小占页面的比例判断是否需要清除junklist - max_image_area_per_page = calculate_max_image_area_per_page( - result, page_width_pts, page_height_pts - ) - if ( - len(max_image_area_per_page) < 0.8 * special_limit_pages - ): # 前10页不全是大图,说明可能是个文字版pdf,把垃圾图片list置空 - junk_img_bojids = [] - else: 
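
For orientation, a minimal standalone sketch of the junk-image heuristic used in get_image_info above: an image xref that repeats on at least half of the pages (and at least junk_limit_min times) is likely a watermark, header or footer rather than real content. The PyMuPDF calls mirror the ones already used in this module; the input path and function name are placeholders, and the sketch skips the special-case handling the module applies afterwards.

    from collections import Counter
    import fitz  # PyMuPDF

    # Illustrative sketch of the repeated-xref heuristic above, not the module code.
    def find_junk_image_xrefs(pdf_path, junk_limit_min=10):
        doc = fitz.open(pdf_path)
        # img[0] is the image xref; the same xref appearing on many pages
        # usually means a watermark / header / footer image.
        counts = Counter(img[0] for page in doc for img in page.get_images())
        junk_limit = max(len(doc) * 0.5, junk_limit_min)
        return [xref for xref, n in counts.items() if n >= junk_limit]

    # print(find_junk_image_xrefs('sample.pdf'))   # placeholder path
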
# 前10页都有图,而且80%都是大图,且每页图片数量一致并都很多,说明是扫描版1,不需要清空junklist - pass - else: # 每页图片数量不一致,需要清掉junklist全量跑前50页图片 - junk_img_bojids = [] - - # 正式进入取前50页图片的信息流程 - result = [] - for i, page in enumerate(doc): - if i >= scan_max_page: - break - page_result = process_image(page, junk_img_bojids) - # logger.info(f"page {i} img_len: {len(page_result)}") - result.append(page_result) - - return result, junk_img_bojids - - -def get_pdf_page_size_pts(doc: fitz.Document): - page_cnt = len(doc) - l: int = min(page_cnt, 50) - # 把所有宽度和高度塞到两个list 分别取中位数(中间遇到了个在纵页里塞横页的pdf,导致宽高互换了) - page_width_list = [] - page_height_list = [] - for i in range(l): - page = doc[i] - page_rect = page.rect - page_width_list.append(page_rect.width) - page_height_list.append(page_rect.height) - - page_width_list.sort() - page_height_list.sort() - - median_width = page_width_list[len(page_width_list) // 2] - median_height = page_height_list[len(page_height_list) // 2] - - return median_width, median_height - - -def get_pdf_textlen_per_page(doc: fitz.Document): - text_len_lst = [] - for page in doc: - # 拿包含img和text的所有blocks - # text_block = page.get_text("blocks") - # 拿所有text的blocks - # text_block = page.get_text("words") - # text_block_len = sum([len(t[4]) for t in text_block]) - # 拿所有text的str - text_block = page.get_text('text') - text_block_len = len(text_block) - # logger.info(f"page {page.number} text_block_len: {text_block_len}") - text_len_lst.append(text_block_len) - - return text_len_lst - - -def get_pdf_text_layout_per_page(doc: fitz.Document): - """根据PDF文档的每一页文本布局,判断该页的文本布局是横向、纵向还是未知。 - - Args: - doc (fitz.Document): PDF文档对象。 - - Returns: - List[str]: 每一页的文本布局(横向、纵向、未知)。 - """ - text_layout_list = [] - - for page_id, page in enumerate(doc): - if page_id >= scan_max_page: - break - # 创建每一页的纵向和横向的文本行数计数器 - vertical_count = 0 - horizontal_count = 0 - text_dict = page.get_text('dict') - if 'blocks' in text_dict: - for block in text_dict['blocks']: - if 'lines' in block: - for line in block['lines']: - # 获取line的bbox顶点坐标 - x0, y0, x1, y1 = line['bbox'] - # 计算bbox的宽高 - width = x1 - x0 - height = y1 - y0 - # 计算bbox的面积 - area = width * height - font_sizes = [] - for span in line['spans']: - if 'size' in span: - font_sizes.append(span['size']) - if len(font_sizes) > 0: - average_font_size = sum(font_sizes) / len(font_sizes) - else: - average_font_size = ( - 10 # 有的line拿不到font_size,先定一个阈值100 - ) - if ( - area <= average_font_size**2 - ): # 判断bbox的面积是否小于平均字体大小的平方,单字无法计算是横向还是纵向 - continue - else: - if 'wmode' in line: # 通过wmode判断文本方向 - if line['wmode'] == 1: # 判断是否为竖向文本 - vertical_count += 1 - elif line['wmode'] == 0: # 判断是否为横向文本 - horizontal_count += 1 - # if 'dir' in line: # 通过旋转角度计算判断文本方向 - # # 获取行的 "dir" 值 - # dir_value = line['dir'] - # cosine, sine = dir_value - # # 计算角度 - # angle = math.degrees(math.acos(cosine)) - # - # # 判断是否为横向文本 - # if abs(angle - 0) < 0.01 or abs(angle - 180) < 0.01: - # # line_text = ' '.join(span['text'] for span in line['spans']) - # # print('This line is horizontal:', line_text) - # horizontal_count += 1 - # # 判断是否为纵向文本 - # elif abs(angle - 90) < 0.01 or abs(angle - 270) < 0.01: - # # line_text = ' '.join(span['text'] for span in line['spans']) - # # print('This line is vertical:', line_text) - # vertical_count += 1 - # print(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}") - # 判断每一页的文本布局 - if vertical_count == 0 and horizontal_count == 0: # 该页没有文本,无法判断 - text_layout_list.append('unknow') - continue - else: - if vertical_count > horizontal_count: # 
该页的文本纵向行数大于横向的 - text_layout_list.append('vertical') - else: # 该页的文本横向行数大于纵向的 - text_layout_list.append('horizontal') - # logger.info(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}") - return text_layout_list - - -"""定义一个自定义异常用来抛出单页svg太多的pdf""" - - -class PageSvgsTooManyError(Exception): - def __init__(self, message='Page SVGs are too many'): - self.message = message - super().__init__(self.message) - - -def get_svgs_per_page(doc: fitz.Document): - svgs_len_list = [] - for page_id, page in enumerate(doc): - # svgs = page.get_drawings() - svgs = page.get_cdrawings() # 切换成get_cdrawings,效率更高 - len_svgs = len(svgs) - if len_svgs >= 3000: - raise PageSvgsTooManyError() - else: - svgs_len_list.append(len_svgs) - # logger.info(f"page_id: {page_id}, svgs_len: {len(svgs)}") - return svgs_len_list - - -def get_imgs_per_page(doc: fitz.Document): - imgs_len_list = [] - for page_id, page in enumerate(doc): - imgs = page.get_images() - imgs_len_list.append(len(imgs)) - # logger.info(f"page_id: {page}, imgs_len: {len(imgs)}") - - return imgs_len_list - - -def get_language(doc: fitz.Document): - """ - 获取PDF文档的语言。 - Args: - doc (fitz.Document): PDF文档对象。 - Returns: - str: 文档语言,如 "en-US"。 - """ - language_lst = [] - for page_id, page in enumerate(doc): - if page_id >= scan_max_page: - break - # 拿所有text的str - text_block = page.get_text('text') - page_language = detect_lang(text_block) - language_lst.append(page_language) - - # logger.info(f"page_id: {page_id}, page_language: {page_language}") - - # 统计text_language_list中每种语言的个数 - count_dict = Counter(language_lst) - # 输出text_language_list中出现的次数最多的语言 - language = max(count_dict, key=count_dict.get) - return language - - -def check_invalid_chars(pdf_bytes): - """乱码检测.""" - # return detect_invalid_chars_by_pymupdf(pdf_bytes) - return detect_invalid_chars(pdf_bytes) - - -def pdf_meta_scan(pdf_bytes: bytes): - """ - :param s3_pdf_path: - :param pdf_bytes: pdf文件的二进制数据 - 几个维度来评价:是否加密,是否需要密码,纸张大小,总页数,是否文字可提取 - """ - doc = fitz.open('pdf', pdf_bytes) - is_needs_password = doc.needs_pass - is_encrypted = doc.is_encrypted - total_page = len(doc) - if total_page == 0: - logger.warning(f'drop this pdf, drop_reason: {DropReason.EMPTY_PDF}') - result = {'_need_drop': True, '_drop_reason': DropReason.EMPTY_PDF} - return result - else: - page_width_pts, page_height_pts = get_pdf_page_size_pts(doc) - # logger.info(f"page_width_pts: {page_width_pts}, page_height_pts: {page_height_pts}") - - # svgs_per_page = get_svgs_per_page(doc) - # logger.info(f"svgs_per_page: {svgs_per_page}") - imgs_per_page = get_imgs_per_page(doc) - # logger.info(f"imgs_per_page: {imgs_per_page}") - - image_info_per_page, junk_img_bojids = get_image_info( - doc, page_width_pts, page_height_pts - ) - # logger.info(f"image_info_per_page: {image_info_per_page}, junk_img_bojids: {junk_img_bojids}") - text_len_per_page = get_pdf_textlen_per_page(doc) - # logger.info(f"text_len_per_page: {text_len_per_page}") - # text_layout_per_page = get_pdf_text_layout_per_page(doc) - # logger.info(f"text_layout_per_page: {text_layout_per_page}") - # text_language = get_language(doc) - # logger.info(f"text_language: {text_language}") - invalid_chars = check_invalid_chars(pdf_bytes) - # logger.info(f"invalid_chars: {invalid_chars}") - - # 最后输出一条json - res = { - 'is_needs_password': is_needs_password, - 'is_encrypted': is_encrypted, - 'total_page': total_page, - 'page_width_pts': int(page_width_pts), - 'page_height_pts': int(page_height_pts), - 'image_info_per_page': 
image_info_per_page, - 'text_len_per_page': text_len_per_page, - # 'text_layout_per_page': text_layout_per_page, - # 'text_language': text_language, - # "svgs_per_page": svgs_per_page, - 'imgs_per_page': imgs_per_page, # 增加每页img数量list - 'junk_img_bojids': junk_img_bojids, # 增加垃圾图片的bojid list - 'invalid_chars': invalid_chars, - 'metadata': doc.metadata, - } - # logger.info(json.dumps(res, ensure_ascii=False)) - return res - - -if __name__ == '__main__': - pass - # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师-大乘无量寿.pdf" - # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\三国演义_繁体竖排版.pdf" - # "D:\project/20231108code-clean\pdf_cost_time\scihub\scihub_86800000\libgen.scimag86880000-86880999.zip_10.1021/acsami.1c03109.s002.pdf" - # "D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_18600000/libgen.scimag18645000-18645999.zip_10.1021/om3006239.pdf" - # file_content = read_file("D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_31000000/libgen.scimag31098000-31098999.zip_10.1109/isit.2006.261791.pdf","") # noqa: E501 - # file_content = read_file("D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师_大乘无量寿.pdf","") - # doc = fitz.open("pdf", file_content) - # text_layout_lst = get_pdf_text_layout_per_page(doc) - # print(text_layout_lst) diff --git a/magic_pdf/integrations/__init__.py b/magic_pdf/integrations/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/magic_pdf/integrations/rag/__init__.py b/magic_pdf/integrations/rag/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/magic_pdf/integrations/rag/api.py b/magic_pdf/integrations/rag/api.py deleted file mode 100644 index 5c05f91169dad911b147a4f9c518af26a419b449..0000000000000000000000000000000000000000 --- a/magic_pdf/integrations/rag/api.py +++ /dev/null @@ -1,82 +0,0 @@ -import os -from pathlib import Path - -from loguru import logger - -from magic_pdf.integrations.rag.type import (ElementRelation, LayoutElements, - Node) -from magic_pdf.integrations.rag.utils import inference - - -class RagPageReader: - - def __init__(self, pagedata: LayoutElements): - self.o = [ - Node( - category_type=v.category_type, - text=v.text, - image_path=v.image_path, - anno_id=v.anno_id, - latex=v.latex, - html=v.html, - ) for v in pagedata.layout_dets - ] - - self.pagedata = pagedata - - def __iter__(self): - return iter(self.o) - - def get_rel_map(self) -> list[ElementRelation]: - return self.pagedata.extra.element_relation - - -class RagDocumentReader: - - def __init__(self, ragdata: list[LayoutElements]): - self.o = [RagPageReader(v) for v in ragdata] - - def __iter__(self): - return iter(self.o) - - -class DataReader: - - def __init__(self, path_or_directory: str, method: str, output_dir: str): - self.path_or_directory = path_or_directory - self.method = method - self.output_dir = output_dir - self.pdfs = [] - if os.path.isdir(path_or_directory): - for doc_path in Path(path_or_directory).glob('*.pdf'): - self.pdfs.append(doc_path) - else: - assert path_or_directory.endswith('.pdf') - self.pdfs.append(Path(path_or_directory)) - - def get_documents_count(self) -> int: - """Returns the number of documents in the directory.""" - return len(self.pdfs) - - def get_document_result(self, idx: int) -> RagDocumentReader | None: - """ - Args: - idx (int): the index of documents under the - directory path_or_directory - - Returns: - RagDocumentReader | None: 
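
A usage sketch of the DataReader / RagDocumentReader API defined in magic_pdf/integrations/rag/api.py above. The directory, method string and output path are placeholders; the iteration pattern follows the reader classes as shown.

    # Illustrative usage of the RAG integration readers shown above.
    from magic_pdf.integrations.rag.api import DataReader

    reader = DataReader('./pdfs', 'auto', './output')   # placeholder arguments
    print(reader.get_documents_count())

    doc = reader.get_document_result(0)    # RagDocumentReader or None
    if doc is not None:
        for page in doc:                   # each page is a RagPageReader
            for node in page:              # each node is a Node (text / image / table ...)
                print(node.category_type, (node.text or '')[:40])
            print(page.get_rel_map())      # sibling relations between elements on the page
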
RagDocumentReader is an iterable object, - more details @RagDocumentReader - """ - if idx >= self.get_documents_count() or idx < 0: - logger.error(f'invalid idx: {idx}') - return None - res = inference(str(self.pdfs[idx]), self.output_dir, self.method) - if res is None: - logger.warning(f'failed to inference pdf {self.pdfs[idx]}') - return None - return RagDocumentReader(res) - - def get_document_filename(self, idx: int) -> Path: - """get the filename of the document.""" - return self.pdfs[idx] diff --git a/magic_pdf/integrations/rag/type.py b/magic_pdf/integrations/rag/type.py deleted file mode 100644 index 11258af39487f3084a900d44c5bc4eb364ef2230..0000000000000000000000000000000000000000 --- a/magic_pdf/integrations/rag/type.py +++ /dev/null @@ -1,82 +0,0 @@ -from enum import Enum - -from pydantic import BaseModel, Field - - -# rag -class CategoryType(Enum): # py310 not support StrEnum - text = 'text' - title = 'title' - interline_equation = 'interline_equation' - image = 'image' - image_body = 'image_body' - image_caption = 'image_caption' - table = 'table' - table_body = 'table_body' - table_caption = 'table_caption' - table_footnote = 'table_footnote' - - -class ElementRelType(Enum): - sibling = 'sibling' - - -class PageInfo(BaseModel): - page_no: int = Field(description='the index of page, start from zero', - ge=0) - height: int = Field(description='the height of page', gt=0) - width: int = Field(description='the width of page', ge=0) - image_path: str | None = Field(description='the image of this page', - default=None) - - -class ContentObject(BaseModel): - category_type: CategoryType = Field(description='类别') - poly: list[float] = Field( - description=('Coordinates, need to convert back to PDF coordinates,' - ' order is top-left, top-right, bottom-right, bottom-left' - ' x,y coordinates')) - ignore: bool = Field(description='whether ignore this object', - default=False) - text: str | None = Field(description='text content of the object', - default=None) - image_path: str | None = Field(description='path of embedded image', - default=None) - order: int = Field(description='the order of this object within a page', - default=-1) - anno_id: int = Field(description='unique id', default=-1) - latex: str | None = Field(description='latex result', default=None) - html: str | None = Field(description='html result', default=None) - - -class ElementRelation(BaseModel): - source_anno_id: int = Field(description='unique id of the source object', - default=-1) - target_anno_id: int = Field(description='unique id of the target object', - default=-1) - relation: ElementRelType = Field( - description='the relation between source and target element') - - -class LayoutElementsExtra(BaseModel): - element_relation: list[ElementRelation] = Field( - description='the relation between source and target element') - - -class LayoutElements(BaseModel): - layout_dets: list[ContentObject] = Field( - description='layout element details') - page_info: PageInfo = Field(description='page info') - extra: LayoutElementsExtra = Field(description='extra information') - - -# iter data format -class Node(BaseModel): - category_type: CategoryType = Field(description='类别') - text: str | None = Field(description='text content of the object', - default=None) - image_path: str | None = Field(description='path of embedded image', - default=None) - anno_id: int = Field(description='unique id', default=-1) - latex: str | None = Field(description='latex result', default=None) - html: str | None = Field(description='html result', 
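# Usage sketch for the RAG reader API above. Paths are placeholders; the class and
# method names come directly from api.py, and 'ocr' is one of the parse methods the
# surrounding code itself uses.
from magic_pdf.integrations.rag.api import DataReader

reader = DataReader('/path/to/pdfs', 'ocr', '/path/to/output')
for idx in range(reader.get_documents_count()):
    doc = reader.get_document_result(idx)    # RagDocumentReader, or None on failure
    if doc is None:
        continue
    for page in doc:                         # one RagPageReader per page
        for node in page:                    # Node: category_type, text, image_path, ...
            print(node.category_type, (node.text or '')[:40])
        print(page.get_rel_map())            # sibling relations between anno_ids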
default=None) diff --git a/magic_pdf/integrations/rag/utils.py b/magic_pdf/integrations/rag/utils.py deleted file mode 100644 index 49e9dc0ee2a6955219012dd05aa58d6e56b1f25d..0000000000000000000000000000000000000000 --- a/magic_pdf/integrations/rag/utils.py +++ /dev/null @@ -1,284 +0,0 @@ -import json -import os -from pathlib import Path - -from loguru import logger - -import magic_pdf.model as model_config -from magic_pdf.config.ocr_content_type import BlockType, ContentType -from magic_pdf.data.data_reader_writer import FileBasedDataReader -from magic_pdf.dict2md.ocr_mkcontent import merge_para_with_text -from magic_pdf.integrations.rag.type import (CategoryType, ContentObject, - ElementRelation, ElementRelType, - LayoutElements, - LayoutElementsExtra, PageInfo) -from magic_pdf.tools.common import do_parse, prepare_env - - -def convert_middle_json_to_layout_elements( - json_data: dict, - output_dir: str, -) -> list[LayoutElements]: - uniq_anno_id = 0 - - res: list[LayoutElements] = [] - for page_no, page_data in enumerate(json_data['pdf_info']): - order_id = 0 - page_info = PageInfo( - height=int(page_data['page_size'][1]), - width=int(page_data['page_size'][0]), - page_no=page_no, - ) - layout_dets: list[ContentObject] = [] - extra_element_relation: list[ElementRelation] = [] - - for para_block in page_data['para_blocks']: - para_text = '' - para_type = para_block['type'] - - if para_type == BlockType.Text: - para_text = merge_para_with_text(para_block) - x0, y0, x1, y1 = para_block['bbox'] - content = ContentObject( - anno_id=uniq_anno_id, - category_type=CategoryType.text, - text=para_text, - order=order_id, - poly=[x0, y0, x1, y0, x1, y1, x0, y1], - ) - uniq_anno_id += 1 - order_id += 1 - layout_dets.append(content) - - elif para_type == BlockType.Title: - para_text = merge_para_with_text(para_block) - x0, y0, x1, y1 = para_block['bbox'] - content = ContentObject( - anno_id=uniq_anno_id, - category_type=CategoryType.title, - text=para_text, - order=order_id, - poly=[x0, y0, x1, y0, x1, y1, x0, y1], - ) - uniq_anno_id += 1 - order_id += 1 - layout_dets.append(content) - - elif para_type == BlockType.InterlineEquation: - para_text = merge_para_with_text(para_block) - x0, y0, x1, y1 = para_block['bbox'] - content = ContentObject( - anno_id=uniq_anno_id, - category_type=CategoryType.interline_equation, - text=para_text, - order=order_id, - poly=[x0, y0, x1, y0, x1, y1, x0, y1], - ) - uniq_anno_id += 1 - order_id += 1 - layout_dets.append(content) - - elif para_type == BlockType.Image: - body_anno_id = -1 - caption_anno_id = -1 - - for block in para_block['blocks']: - if block['type'] == BlockType.ImageBody: - for line in block['lines']: - for span in line['spans']: - if span['type'] == ContentType.Image: - x0, y0, x1, y1 = block['bbox'] - content = ContentObject( - anno_id=uniq_anno_id, - category_type=CategoryType.image_body, - image_path=os.path.join( - output_dir, span['image_path']), - order=order_id, - poly=[x0, y0, x1, y0, x1, y1, x0, y1], - ) - body_anno_id = uniq_anno_id - uniq_anno_id += 1 - order_id += 1 - layout_dets.append(content) - - for block in para_block['blocks']: - if block['type'] == BlockType.ImageCaption: - para_text += merge_para_with_text(block) - x0, y0, x1, y1 = block['bbox'] - content = ContentObject( - anno_id=uniq_anno_id, - category_type=CategoryType.image_caption, - text=para_text, - order=order_id, - poly=[x0, y0, x1, y0, x1, y1, x0, y1], - ) - caption_anno_id = uniq_anno_id - uniq_anno_id += 1 - order_id += 1 - layout_dets.append(content) - - if 
body_anno_id > 0 and caption_anno_id > 0: - element_relation = ElementRelation( - relation=ElementRelType.sibling, - source_anno_id=body_anno_id, - target_anno_id=caption_anno_id, - ) - extra_element_relation.append(element_relation) - - elif para_type == BlockType.Table: - body_anno_id, caption_anno_id, footnote_anno_id = -1, -1, -1 - - for block in para_block['blocks']: - if block['type'] == BlockType.TableCaption: - para_text += merge_para_with_text(block) - x0, y0, x1, y1 = block['bbox'] - content = ContentObject( - anno_id=uniq_anno_id, - category_type=CategoryType.table_caption, - text=para_text, - order=order_id, - poly=[x0, y0, x1, y0, x1, y1, x0, y1], - ) - caption_anno_id = uniq_anno_id - uniq_anno_id += 1 - order_id += 1 - layout_dets.append(content) - - for block in para_block['blocks']: - if block['type'] == BlockType.TableBody: - for line in block['lines']: - for span in line['spans']: - if span['type'] == ContentType.Table: - x0, y0, x1, y1 = para_block['bbox'] - content = ContentObject( - anno_id=uniq_anno_id, - category_type=CategoryType.table_body, - order=order_id, - poly=[x0, y0, x1, y0, x1, y1, x0, y1], - ) - body_anno_id = uniq_anno_id - uniq_anno_id += 1 - order_id += 1 - # if processed by table model - if span.get('latex', ''): - content.latex = span['latex'] - else: - content.image_path = os.path.join( - output_dir, span['image_path']) - layout_dets.append(content) - - for block in para_block['blocks']: - if block['type'] == BlockType.TableFootnote: - para_text += merge_para_with_text(block) - x0, y0, x1, y1 = block['bbox'] - content = ContentObject( - anno_id=uniq_anno_id, - category_type=CategoryType.table_footnote, - text=para_text, - order=order_id, - poly=[x0, y0, x1, y0, x1, y1, x0, y1], - ) - footnote_anno_id = uniq_anno_id - uniq_anno_id += 1 - order_id += 1 - layout_dets.append(content) - - if caption_anno_id != -1 and body_anno_id != -1: - element_relation = ElementRelation( - relation=ElementRelType.sibling, - source_anno_id=body_anno_id, - target_anno_id=caption_anno_id, - ) - extra_element_relation.append(element_relation) - - if footnote_anno_id != -1 and body_anno_id != -1: - element_relation = ElementRelation( - relation=ElementRelType.sibling, - source_anno_id=body_anno_id, - target_anno_id=footnote_anno_id, - ) - extra_element_relation.append(element_relation) - - res.append( - LayoutElements( - page_info=page_info, - layout_dets=layout_dets, - extra=LayoutElementsExtra( - element_relation=extra_element_relation), - )) - - return res - - -def inference(path, output_dir, method): - model_config.__use_inside_model__ = True - model_config.__model_mode__ = 'full' - if output_dir == '': - if os.path.isdir(path): - output_dir = os.path.join(path, 'output') - else: - output_dir = os.path.join(os.path.dirname(path), 'output') - - local_image_dir, local_md_dir = prepare_env(output_dir, - str(Path(path).stem), method) - - def read_fn(path): - disk_rw = FileBasedDataReader(os.path.dirname(path)) - return disk_rw.read(os.path.basename(path)) - - def parse_doc(doc_path: str): - try: - file_name = str(Path(doc_path).stem) - pdf_data = read_fn(doc_path) - do_parse( - output_dir, - file_name, - pdf_data, - [], - method, - False, - f_draw_span_bbox=False, - f_draw_layout_bbox=False, - f_dump_md=False, - f_dump_middle_json=True, - f_dump_model_json=False, - f_dump_orig_pdf=False, - f_dump_content_list=False, - f_draw_model_bbox=False, - ) - - middle_json_fn = os.path.join(local_md_dir, - f'{file_name}_middle.json') - with open(middle_json_fn) as fd: - jso = 
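# Note on the bbox -> poly conversion used repeatedly above: an axis-aligned
# bbox (x0, y0, x1, y1) is expanded to the four corners in the order documented
# for ContentObject.poly (top-left, top-right, bottom-right, bottom-left).
x0, y0, x1, y1 = 100.0, 50.0, 300.0, 80.0          # illustrative coordinates only
poly = [x0, y0, x1, y0, x1, y1, x0, y1]
# -> [100.0, 50.0, 300.0, 50.0, 300.0, 80.0, 100.0, 80.0]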
json.load(fd) - os.remove(middle_json_fn) - return convert_middle_json_to_layout_elements(jso, local_image_dir) - - except Exception as e: - logger.exception(e) - - return parse_doc(path) - - -if __name__ == '__main__': - import pprint - - base_dir = '/opt/data/pdf/resources/samples/' - if 0: - with open(base_dir + 'json_outputs/middle.json') as f: - d = json.load(f) - result = convert_middle_json_to_layout_elements(d, '/tmp') - pprint.pp(result) - if 0: - with open(base_dir + 'json_outputs/middle.3.json') as f: - d = json.load(f) - result = convert_middle_json_to_layout_elements(d, '/tmp') - pprint.pp(result) - - if 1: - res = inference( - base_dir + 'samples/pdf/one_page_with_table_image.pdf', - '/tmp/output', - 'ocr', - ) - pprint.pp(res) diff --git a/magic_pdf/libs/__init__.py b/magic_pdf/libs/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/magic_pdf/libs/boxbase.py b/magic_pdf/libs/boxbase.py deleted file mode 100644 index 2813121bb3fcde988d510b89646478c97461da74..0000000000000000000000000000000000000000 --- a/magic_pdf/libs/boxbase.py +++ /dev/null @@ -1,485 +0,0 @@ -import math - - -def _is_in_or_part_overlap(box1, box2) -> bool: - """两个bbox是否有部分重叠或者包含.""" - if box1 is None or box2 is None: - return False - - x0_1, y0_1, x1_1, y1_1 = box1 - x0_2, y0_2, x1_2, y1_2 = box2 - - return not (x1_1 < x0_2 or # box1在box2的左边 - x0_1 > x1_2 or # box1在box2的右边 - y1_1 < y0_2 or # box1在box2的上边 - y0_1 > y1_2) # box1在box2的下边 - - -def _is_in_or_part_overlap_with_area_ratio(box1, - box2, - area_ratio_threshold=0.6): - """判断box1是否在box2里面,或者box1和box2有部分重叠,且重叠面积占box1的比例超过area_ratio_threshold.""" - if box1 is None or box2 is None: - return False - - x0_1, y0_1, x1_1, y1_1 = box1 - x0_2, y0_2, x1_2, y1_2 = box2 - - if not _is_in_or_part_overlap(box1, box2): - return False - - # 计算重叠面积 - x_left = max(x0_1, x0_2) - y_top = max(y0_1, y0_2) - x_right = min(x1_1, x1_2) - y_bottom = min(y1_1, y1_2) - overlap_area = (x_right - x_left) * (y_bottom - y_top) - - # 计算box1的面积 - box1_area = (x1_1 - x0_1) * (y1_1 - y0_1) - - return overlap_area / box1_area > area_ratio_threshold - - -def _is_in(box1, box2) -> bool: - """box1是否完全在box2里面.""" - x0_1, y0_1, x1_1, y1_1 = box1 - x0_2, y0_2, x1_2, y1_2 = box2 - - return (x0_1 >= x0_2 and # box1的左边界不在box2的左边外 - y0_1 >= y0_2 and # box1的上边界不在box2的上边外 - x1_1 <= x1_2 and # box1的右边界不在box2的右边外 - y1_1 <= y1_2) # box1的下边界不在box2的下边外 - - -def _is_part_overlap(box1, box2) -> bool: - """两个bbox是否有部分重叠,但不完全包含.""" - if box1 is None or box2 is None: - return False - - return _is_in_or_part_overlap(box1, box2) and not _is_in(box1, box2) - - -def _left_intersect(left_box, right_box): - """检查两个box的左边界是否有交集,也就是left_box的右边界是否在right_box的左边界内.""" - if left_box is None or right_box is None: - return False - - x0_1, y0_1, x1_1, y1_1 = left_box - x0_2, y0_2, x1_2, y1_2 = right_box - - return x1_1 > x0_2 and x0_1 < x0_2 and (y0_1 <= y0_2 <= y1_1 - or y0_1 <= y1_2 <= y1_1) - - -def _right_intersect(left_box, right_box): - """检查box是否在右侧边界有交集,也就是left_box的左边界是否在right_box的右边界内.""" - if left_box is None or right_box is None: - return False - - x0_1, y0_1, x1_1, y1_1 = left_box - x0_2, y0_2, x1_2, y1_2 = right_box - - return x0_1 < x1_2 and x1_1 > x1_2 and (y0_1 <= y0_2 <= y1_1 - or y0_1 <= y1_2 <= y1_1) - - -def _is_vertical_full_overlap(box1, box2, x_torlence=2): - """x方向上:要么box1包含box2, 要么box2包含box1。不能部分包含 y方向上:box1和box2有重叠.""" - # 解析box的坐标 - x11, y11, x12, y12 = box1 # 左上角和右下角的坐标 (x1, y1, x2, y2) - x21, y21, x22, y22 = 
box2 - - # 在x轴方向上,box1是否包含box2 或 box2包含box1 - contains_in_x = (x11 - x_torlence <= x21 and x12 + x_torlence >= x22) or ( - x21 - x_torlence <= x11 and x22 + x_torlence >= x12) - - # 在y轴方向上,box1和box2是否有重叠 - overlap_in_y = not (y12 < y21 or y11 > y22) - - return contains_in_x and overlap_in_y - - -def _is_bottom_full_overlap(box1, box2, y_tolerance=2): - """检查box1下方和box2的上方有轻微的重叠,轻微程度收到y_tolerance的限制 这个函数和_is_vertical- - full_overlap的区别是,这个函数允许box1和box2在x方向上有轻微的重叠,允许一定的模糊度.""" - if box1 is None or box2 is None: - return False - - x0_1, y0_1, x1_1, y1_1 = box1 - x0_2, y0_2, x1_2, y1_2 = box2 - tolerance_margin = 2 - is_xdir_full_overlap = ( - (x0_1 - tolerance_margin <= x0_2 <= x1_1 + tolerance_margin - and x0_1 - tolerance_margin <= x1_2 <= x1_1 + tolerance_margin) - or (x0_2 - tolerance_margin <= x0_1 <= x1_2 + tolerance_margin - and x0_2 - tolerance_margin <= x1_1 <= x1_2 + tolerance_margin)) - - return y0_2 < y1_1 and 0 < (y1_1 - - y0_2) < y_tolerance and is_xdir_full_overlap - - -def _is_left_overlap( - box1, - box2, -): - """检查box1的左侧是否和box2有重叠 在Y方向上可以是部分重叠或者是完全重叠。不分box1和box2的上下关系,也就是无论box1在box2下 - 方还是box2在box1下方,都可以检测到重叠。 X方向上.""" - - def __overlap_y(Ay1, Ay2, By1, By2): - return max(0, min(Ay2, By2) - max(Ay1, By1)) - - if box1 is None or box2 is None: - return False - - x0_1, y0_1, x1_1, y1_1 = box1 - x0_2, y0_2, x1_2, y1_2 = box2 - - y_overlap_len = __overlap_y(y0_1, y1_1, y0_2, y1_2) - ratio_1 = 1.0 * y_overlap_len / (y1_1 - y0_1) if y1_1 - y0_1 != 0 else 0 - ratio_2 = 1.0 * y_overlap_len / (y1_2 - y0_2) if y1_2 - y0_2 != 0 else 0 - vertical_overlap_cond = ratio_1 >= 0.5 or ratio_2 >= 0.5 - - # vertical_overlap_cond = y0_1<=y0_2<=y1_1 or y0_1<=y1_2<=y1_1 or y0_2<=y0_1<=y1_2 or y0_2<=y1_1<=y1_2 - return x0_1 <= x0_2 <= x1_1 and vertical_overlap_cond - - -def __is_overlaps_y_exceeds_threshold(bbox1, - bbox2, - overlap_ratio_threshold=0.8): - """检查两个bbox在y轴上是否有重叠,并且该重叠区域的高度占两个bbox高度更低的那个超过80%""" - _, y0_1, _, y1_1 = bbox1 - _, y0_2, _, y1_2 = bbox2 - - overlap = max(0, min(y1_1, y1_2) - max(y0_1, y0_2)) - height1, height2 = y1_1 - y0_1, y1_2 - y0_2 - # max_height = max(height1, height2) - min_height = min(height1, height2) - - return (overlap / min_height) > overlap_ratio_threshold - - -def calculate_iou(bbox1, bbox2): - """计算两个边界框的交并比(IOU)。 - - Args: - bbox1 (list[float]): 第一个边界框的坐标,格式为 [x1, y1, x2, y2],其中 (x1, y1) 为左上角坐标,(x2, y2) 为右下角坐标。 - bbox2 (list[float]): 第二个边界框的坐标,格式与 `bbox1` 相同。 - - Returns: - float: 两个边界框的交并比(IOU),取值范围为 [0, 1]。 - """ - # Determine the coordinates of the intersection rectangle - x_left = max(bbox1[0], bbox2[0]) - y_top = max(bbox1[1], bbox2[1]) - x_right = min(bbox1[2], bbox2[2]) - y_bottom = min(bbox1[3], bbox2[3]) - - if x_right < x_left or y_bottom < y_top: - return 0.0 - - # The area of overlap area - intersection_area = (x_right - x_left) * (y_bottom - y_top) - - # The area of both rectangles - bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]) - bbox2_area = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1]) - - if any([bbox1_area == 0, bbox2_area == 0]): - return 0 - - # Compute the intersection over union by taking the intersection area - # and dividing it by the sum of both areas minus the intersection area - iou = intersection_area / float(bbox1_area + bbox2_area - intersection_area) - - return iou - - -def calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2): - """计算box1和box2的重叠面积占最小面积的box的比例.""" - # Determine the coordinates of the intersection rectangle - x_left = max(bbox1[0], bbox2[0]) - y_top = max(bbox1[1], bbox2[1]) - x_right = 
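# Worked example (illustrative boxes) for the overlap helpers and calculate_iou above:
bbox_a = [0, 0, 10, 10]
bbox_b = [5, 5, 15, 15]
# intersection = (10 - 5) * (10 - 5) = 25, union = 100 + 100 - 25 = 175
print(calculate_iou(bbox_a, bbox_b))                 # 25 / 175 ≈ 0.1429
print(_is_in_or_part_overlap(bbox_a, bbox_b))        # True: partial overlap
print(_is_in([2, 2, 8, 8], [0, 0, 10, 10]))          # True: fully contained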
min(bbox1[2], bbox2[2]) - y_bottom = min(bbox1[3], bbox2[3]) - - if x_right < x_left or y_bottom < y_top: - return 0.0 - - # The area of overlap area - intersection_area = (x_right - x_left) * (y_bottom - y_top) - min_box_area = min([(bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]), - (bbox2[3] - bbox2[1]) * (bbox2[2] - bbox2[0])]) - if min_box_area == 0: - return 0 - else: - return intersection_area / min_box_area - - -def calculate_overlap_area_in_bbox1_area_ratio(bbox1, bbox2): - """计算box1和box2的重叠面积占bbox1的比例.""" - # Determine the coordinates of the intersection rectangle - x_left = max(bbox1[0], bbox2[0]) - y_top = max(bbox1[1], bbox2[1]) - x_right = min(bbox1[2], bbox2[2]) - y_bottom = min(bbox1[3], bbox2[3]) - - if x_right < x_left or y_bottom < y_top: - return 0.0 - - # The area of overlap area - intersection_area = (x_right - x_left) * (y_bottom - y_top) - bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]) - if bbox1_area == 0: - return 0 - else: - return intersection_area / bbox1_area - - -def get_minbox_if_overlap_by_ratio(bbox1, bbox2, ratio): - """通过calculate_overlap_area_2_minbox_area_ratio计算两个bbox重叠的面积占最小面积的box的比例 - 如果比例大于ratio,则返回小的那个bbox, 否则返回None.""" - x1_min, y1_min, x1_max, y1_max = bbox1 - x2_min, y2_min, x2_max, y2_max = bbox2 - area1 = (x1_max - x1_min) * (y1_max - y1_min) - area2 = (x2_max - x2_min) * (y2_max - y2_min) - overlap_ratio = calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2) - if overlap_ratio > ratio: - if area1 <= area2: - return bbox1 - else: - return bbox2 - else: - return None - - -def get_bbox_in_boundary(bboxes: list, boundary: tuple) -> list: - x0, y0, x1, y1 = boundary - new_boxes = [ - box for box in bboxes - if box[0] >= x0 and box[1] >= y0 and box[2] <= x1 and box[3] <= y1 - ] - return new_boxes - - -def is_vbox_on_side(bbox, width, height, side_threshold=0.2): - """判断一个bbox是否在pdf页面的边缘.""" - x0, x1 = bbox[0], bbox[2] - if x1 <= width * side_threshold or x0 >= width * (1 - side_threshold): - return True - return False - - -def find_top_nearest_text_bbox(pymu_blocks, obj_bbox): - tolerance_margin = 4 - top_boxes = [ - box for box in pymu_blocks - if obj_bbox[1] - box['bbox'][3] >= -tolerance_margin - and not _is_in(box['bbox'], obj_bbox) - ] - # 然后找到X方向上有互相重叠的 - top_boxes = [ - box for box in top_boxes if any([ - obj_bbox[0] - tolerance_margin <= box['bbox'][0] <= obj_bbox[2] + - tolerance_margin, obj_bbox[0] - - tolerance_margin <= box['bbox'][2] <= obj_bbox[2] + - tolerance_margin, box['bbox'][0] - - tolerance_margin <= obj_bbox[0] <= box['bbox'][2] + - tolerance_margin, box['bbox'][0] - - tolerance_margin <= obj_bbox[2] <= box['bbox'][2] + - tolerance_margin - ]) - ] - - # 然后找到y1最大的那个 - if len(top_boxes) > 0: - top_boxes.sort(key=lambda x: x['bbox'][3], reverse=True) - return top_boxes[0] - else: - return None - - -def find_bottom_nearest_text_bbox(pymu_blocks, obj_bbox): - bottom_boxes = [ - box for box in pymu_blocks if box['bbox'][1] - - obj_bbox[3] >= -2 and not _is_in(box['bbox'], obj_bbox) - ] - # 然后找到X方向上有互相重叠的 - bottom_boxes = [ - box for box in bottom_boxes if any([ - obj_bbox[0] - 2 <= box['bbox'][0] <= obj_bbox[2] + 2, obj_bbox[0] - - 2 <= box['bbox'][2] <= obj_bbox[2] + 2, box['bbox'][0] - - 2 <= obj_bbox[0] <= box['bbox'][2] + 2, box['bbox'][0] - - 2 <= obj_bbox[2] <= box['bbox'][2] + 2 - ]) - ] - - # 然后找到y0最小的那个 - if len(bottom_boxes) > 0: - bottom_boxes.sort(key=lambda x: x['bbox'][1], reverse=False) - return bottom_boxes[0] - else: - return None - - -def find_left_nearest_text_bbox(pymu_blocks, obj_bbox): - 
"""寻找左侧最近的文本block.""" - left_boxes = [ - box for box in pymu_blocks if obj_bbox[0] - - box['bbox'][2] >= -2 and not _is_in(box['bbox'], obj_bbox) - ] - # 然后找到X方向上有互相重叠的 - left_boxes = [ - box for box in left_boxes if any([ - obj_bbox[1] - 2 <= box['bbox'][1] <= obj_bbox[3] + 2, obj_bbox[1] - - 2 <= box['bbox'][3] <= obj_bbox[3] + 2, box['bbox'][1] - - 2 <= obj_bbox[1] <= box['bbox'][3] + 2, box['bbox'][1] - - 2 <= obj_bbox[3] <= box['bbox'][3] + 2 - ]) - ] - - # 然后找到x1最大的那个 - if len(left_boxes) > 0: - left_boxes.sort(key=lambda x: x['bbox'][2], reverse=True) - return left_boxes[0] - else: - return None - - -def find_right_nearest_text_bbox(pymu_blocks, obj_bbox): - """寻找右侧最近的文本block.""" - right_boxes = [ - box for box in pymu_blocks if box['bbox'][0] - - obj_bbox[2] >= -2 and not _is_in(box['bbox'], obj_bbox) - ] - # 然后找到X方向上有互相重叠的 - right_boxes = [ - box for box in right_boxes if any([ - obj_bbox[1] - 2 <= box['bbox'][1] <= obj_bbox[3] + 2, obj_bbox[1] - - 2 <= box['bbox'][3] <= obj_bbox[3] + 2, box['bbox'][1] - - 2 <= obj_bbox[1] <= box['bbox'][3] + 2, box['bbox'][1] - - 2 <= obj_bbox[3] <= box['bbox'][3] + 2 - ]) - ] - - # 然后找到x0最小的那个 - if len(right_boxes) > 0: - right_boxes.sort(key=lambda x: x['bbox'][0], reverse=False) - return right_boxes[0] - else: - return None - - -def bbox_relative_pos(bbox1, bbox2): - """判断两个矩形框的相对位置关系. - - Args: - bbox1: 一个四元组,表示第一个矩形框的左上角和右下角的坐标,格式为(x1, y1, x1b, y1b) - bbox2: 一个四元组,表示第二个矩形框的左上角和右下角的坐标,格式为(x2, y2, x2b, y2b) - - Returns: - 一个四元组,表示矩形框1相对于矩形框2的位置关系,格式为(left, right, bottom, top) - 其中,left表示矩形框1是否在矩形框2的左侧,right表示矩形框1是否在矩形框2的右侧, - bottom表示矩形框1是否在矩形框2的下方,top表示矩形框1是否在矩形框2的上方 - """ - x1, y1, x1b, y1b = bbox1 - x2, y2, x2b, y2b = bbox2 - - left = x2b < x1 - right = x1b < x2 - bottom = y2b < y1 - top = y1b < y2 - return left, right, bottom, top - - -def bbox_distance(bbox1, bbox2): - """计算两个矩形框的距离。 - - Args: - bbox1 (tuple): 第一个矩形框的坐标,格式为 (x1, y1, x2, y2),其中 (x1, y1) 为左上角坐标,(x2, y2) 为右下角坐标。 - bbox2 (tuple): 第二个矩形框的坐标,格式为 (x1, y1, x2, y2),其中 (x1, y1) 为左上角坐标,(x2, y2) 为右下角坐标。 - - Returns: - float: 矩形框之间的距离。 - """ - - def dist(point1, point2): - return math.sqrt((point1[0] - point2[0])**2 + - (point1[1] - point2[1])**2) - - x1, y1, x1b, y1b = bbox1 - x2, y2, x2b, y2b = bbox2 - - left, right, bottom, top = bbox_relative_pos(bbox1, bbox2) - - if top and left: - return dist((x1, y1b), (x2b, y2)) - elif left and bottom: - return dist((x1, y1), (x2b, y2b)) - elif bottom and right: - return dist((x1b, y1), (x2, y2b)) - elif right and top: - return dist((x1b, y1b), (x2, y2)) - elif left: - return x1 - x2b - elif right: - return x2 - x1b - elif bottom: - return y1 - y2b - elif top: - return y2 - y1b - return 0.0 - - -def box_area(bbox): - return (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) - - -def get_overlap_area(bbox1, bbox2): - """计算box1和box2的重叠面积占bbox1的比例.""" - # Determine the coordinates of the intersection rectangle - x_left = max(bbox1[0], bbox2[0]) - y_top = max(bbox1[1], bbox2[1]) - x_right = min(bbox1[2], bbox2[2]) - y_bottom = min(bbox1[3], bbox2[3]) - - if x_right < x_left or y_bottom < y_top: - return 0.0 - - # The area of overlap area - return (x_right - x_left) * (y_bottom - y_top) - - -def calculate_vertical_projection_overlap_ratio(block1, block2): - """ - Calculate the proportion of the x-axis covered by the vertical projection of two blocks. - - Args: - block1 (tuple): Coordinates of the first block (x0, y0, x1, y1). - block2 (tuple): Coordinates of the second block (x0, y0, x1, y1). 
- - Returns: - float: The proportion of the x-axis covered by the vertical projection of the two blocks. - """ - x0_1, _, x1_1, _ = block1 - x0_2, _, x1_2, _ = block2 - - # Calculate the intersection of the x-coordinates - x_left = max(x0_1, x0_2) - x_right = min(x1_1, x1_2) - - if x_right < x_left: - return 0.0 - - # Length of the intersection - intersection_length = x_right - x_left - - # Length of the x-axis projection of the first block - block1_length = x1_1 - x0_1 - - if block1_length == 0: - return 0.0 - - # Proportion of the x-axis covered by the intersection - # logger.info(f"intersection_length: {intersection_length}, block1_length: {block1_length}") - return intersection_length / block1_length diff --git a/magic_pdf/libs/clean_memory.py b/magic_pdf/libs/clean_memory.py deleted file mode 100644 index 930b99eadb71463816d938936649d82905723bd0..0000000000000000000000000000000000000000 --- a/magic_pdf/libs/clean_memory.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) Opendatalab. All rights reserved. -import torch -import gc - - -def clean_memory(device='cuda'): - if device == 'cuda': - if torch.cuda.is_available(): - torch.cuda.empty_cache() - torch.cuda.ipc_collect() - elif str(device).startswith("npu"): - import torch_npu - if torch_npu.npu.is_available(): - torch_npu.npu.empty_cache() - elif str(device).startswith("mps"): - torch.mps.empty_cache() - gc.collect() \ No newline at end of file diff --git a/magic_pdf/libs/commons.py b/magic_pdf/libs/commons.py deleted file mode 100644 index 20f29ffd309737cfd06f04fa0426eab1ceb4a4b9..0000000000000000000000000000000000000000 --- a/magic_pdf/libs/commons.py +++ /dev/null @@ -1,43 +0,0 @@ - -def join_path(*args): - return '/'.join(str(s).rstrip('/') for s in args) - - -def get_top_percent_list(num_list, percent): - """ - 获取列表中前百分之多少的元素 - :param num_list: - :param percent: - :return: - """ - if len(num_list) == 0: - top_percent_list = [] - else: - # 对imgs_len_list排序 - sorted_imgs_len_list = sorted(num_list, reverse=True) - # 计算 percent 的索引 - top_percent_index = int(len(sorted_imgs_len_list) * percent) - # 取前80%的元素 - top_percent_list = sorted_imgs_len_list[:top_percent_index] - return top_percent_list - - -def mymax(alist: list): - if len(alist) == 0: - return 0 # 空是0, 0*0也是0大小q - else: - return max(alist) - - -def parse_bucket_key(s3_full_path: str): - """ - 输入 s3://bucket/path/to/my/file.txt - 输出 bucket, path/to/my/file.txt - """ - s3_full_path = s3_full_path.strip() - if s3_full_path.startswith("s3://"): - s3_full_path = s3_full_path[5:] - if s3_full_path.startswith("/"): - s3_full_path = s3_full_path[1:] - bucket, key = s3_full_path.split("/", 1) - return bucket, key diff --git a/magic_pdf/libs/config_reader.py b/magic_pdf/libs/config_reader.py deleted file mode 100644 index 2b7e949621a606e4f8a83865945c501056fbefb6..0000000000000000000000000000000000000000 --- a/magic_pdf/libs/config_reader.py +++ /dev/null @@ -1,139 +0,0 @@ -"""根据bucket的名字返回对应的s3 AK, SK,endpoint三元组.""" - -import json -import os - -from loguru import logger - -from magic_pdf.config.constants import MODEL_NAME -from magic_pdf.libs.commons import parse_bucket_key - -# 定义配置文件名常量 -CONFIG_FILE_NAME = os.getenv('MINERU_TOOLS_CONFIG_JSON', 'magic-pdf.json') - - -def read_config(): - if os.path.isabs(CONFIG_FILE_NAME): - config_file = CONFIG_FILE_NAME - else: - home_dir = os.path.expanduser('~') - config_file = os.path.join(home_dir, CONFIG_FILE_NAME) - - if not os.path.exists(config_file): - raise FileNotFoundError(f'{config_file} not found') - - with open(config_file, 'r', 
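# Quick usage sketch for the small helpers in commons.py above (values are illustrative):
print(join_path('s3://bucket/prefix/', 'images', 'page_1.jpg'))
# -> 's3://bucket/prefix/images/page_1.jpg'
print(parse_bucket_key('s3://my-bucket/path/to/file.pdf'))
# -> ('my-bucket', 'path/to/file.pdf')
print(get_top_percent_list([5, 3, 9, 1], 0.5))
# -> [9, 5]: sorted descending, keeping the top int(len * percent) entries
print(mymax([]))
# -> 0: empty lists are treated as zero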
encoding='utf-8') as f: - config = json.load(f) - return config - - -def get_s3_config(bucket_name: str): - """~/magic-pdf.json 读出来.""" - config = read_config() - - bucket_info = config.get('bucket_info') - if bucket_name not in bucket_info: - access_key, secret_key, storage_endpoint = bucket_info['[default]'] - else: - access_key, secret_key, storage_endpoint = bucket_info[bucket_name] - - if access_key is None or secret_key is None or storage_endpoint is None: - raise Exception(f'ak, sk or endpoint not found in {CONFIG_FILE_NAME}') - - # logger.info(f"get_s3_config: ak={access_key}, sk={secret_key}, endpoint={storage_endpoint}") - - return access_key, secret_key, storage_endpoint - - -def get_s3_config_dict(path: str): - access_key, secret_key, storage_endpoint = get_s3_config(get_bucket_name(path)) - return {'ak': access_key, 'sk': secret_key, 'endpoint': storage_endpoint} - - -def get_bucket_name(path): - bucket, key = parse_bucket_key(path) - return bucket - - -def get_local_models_dir(): - config = read_config() - models_dir = config.get('models-dir') - if models_dir is None: - logger.warning(f"'models-dir' not found in {CONFIG_FILE_NAME}, use '/tmp/models' as default") - return '/tmp/models' - else: - return models_dir - - -def get_local_layoutreader_model_dir(): - config = read_config() - layoutreader_model_dir = config.get('layoutreader-model-dir') - if layoutreader_model_dir is None or not os.path.exists(layoutreader_model_dir): - home_dir = os.path.expanduser('~') - layoutreader_at_modelscope_dir_path = os.path.join(home_dir, '.cache/modelscope/hub/ppaanngggg/layoutreader') - logger.warning(f"'layoutreader-model-dir' not exists, use {layoutreader_at_modelscope_dir_path} as default") - return layoutreader_at_modelscope_dir_path - else: - return layoutreader_model_dir - - -def get_device(): - config = read_config() - device = config.get('device-mode') - if device is None: - logger.warning(f"'device-mode' not found in {CONFIG_FILE_NAME}, use 'cpu' as default") - return 'cpu' - else: - return device - - -def get_table_recog_config(): - config = read_config() - table_config = config.get('table-config') - if table_config is None: - logger.warning(f"'table-config' not found in {CONFIG_FILE_NAME}, use 'False' as default") - return json.loads(f'{{"model": "{MODEL_NAME.RAPID_TABLE}","enable": false, "max_time": 400}}') - else: - return table_config - - -def get_layout_config(): - config = read_config() - layout_config = config.get('layout-config') - if layout_config is None: - logger.warning(f"'layout-config' not found in {CONFIG_FILE_NAME}, use '{MODEL_NAME.LAYOUTLMv3}' as default") - return json.loads(f'{{"model": "{MODEL_NAME.LAYOUTLMv3}"}}') - else: - return layout_config - - -def get_formula_config(): - config = read_config() - formula_config = config.get('formula-config') - if formula_config is None: - logger.warning(f"'formula-config' not found in {CONFIG_FILE_NAME}, use 'True' as default") - return json.loads(f'{{"mfd_model": "{MODEL_NAME.YOLO_V8_MFD}","mfr_model": "{MODEL_NAME.UniMerNet_v2_Small}","enable": true}}') - else: - return formula_config - -def get_llm_aided_config(): - config = read_config() - llm_aided_config = config.get('llm-aided-config') - if llm_aided_config is None: - logger.warning(f"'llm-aided-config' not found in {CONFIG_FILE_NAME}, use 'None' as default") - return None - else: - return llm_aided_config - -def get_latex_delimiter_config(): - config = read_config() - latex_delimiter_config = config.get('latex-delimiter-config') - if latex_delimiter_config is 
None: - logger.warning(f"'latex-delimiter-config' not found in {CONFIG_FILE_NAME}, use 'None' as default") - return None - else: - return latex_delimiter_config - - -if __name__ == '__main__': - ak, sk, endpoint = get_s3_config('llm-raw') diff --git a/magic_pdf/libs/convert_utils.py b/magic_pdf/libs/convert_utils.py deleted file mode 100644 index 99a1879d46befa2de63aa1a379ab83dbf6fdb1f1..0000000000000000000000000000000000000000 --- a/magic_pdf/libs/convert_utils.py +++ /dev/null @@ -1,5 +0,0 @@ -def dict_to_list(input_dict): - items_list = [] - for _, item in input_dict.items(): - items_list.append(item) - return items_list diff --git a/magic_pdf/libs/coordinate_transform.py b/magic_pdf/libs/coordinate_transform.py deleted file mode 100644 index 7cd7a0768596174d71ea8b3c8309c0ec998b3c81..0000000000000000000000000000000000000000 --- a/magic_pdf/libs/coordinate_transform.py +++ /dev/null @@ -1,9 +0,0 @@ -def get_scale_ratio(model_page_info, page): - pix = page.get_pixmap(dpi=72) - pymu_width = int(pix.w) - pymu_height = int(pix.h) - width_from_json = model_page_info['page_info']['width'] - height_from_json = model_page_info['page_info']['height'] - horizontal_scale_ratio = width_from_json / pymu_width - vertical_scale_ratio = height_from_json / pymu_height - return horizontal_scale_ratio, vertical_scale_ratio diff --git a/magic_pdf/libs/draw_bbox.py b/magic_pdf/libs/draw_bbox.py deleted file mode 100644 index c2ad21d091cff9c2d3026f97da486129b6b34edf..0000000000000000000000000000000000000000 --- a/magic_pdf/libs/draw_bbox.py +++ /dev/null @@ -1,418 +0,0 @@ -import fitz -from magic_pdf.config.constants import CROSS_PAGE -from magic_pdf.config.ocr_content_type import (BlockType, CategoryId, - ContentType) -from magic_pdf.data.dataset import Dataset -from magic_pdf.model.magic_model import MagicModel - - -def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config): - new_rgb = [] - for item in rgb_config: - item = float(item) / 255 - new_rgb.append(item) - page_data = bbox_list[i] - for bbox in page_data: - x0, y0, x1, y1 = bbox - rect_coords = fitz.Rect(x0, y0, x1, y1) # Define the rectangle - if fill_config: - page.draw_rect( - rect_coords, - color=None, - fill=new_rgb, - fill_opacity=0.3, - width=0.5, - overlay=True, - ) # Draw the rectangle - else: - page.draw_rect( - rect_coords, - color=new_rgb, - fill=None, - fill_opacity=1, - width=0.5, - overlay=True, - ) # Draw the rectangle - - -def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config, draw_bbox=True): - new_rgb = [] - for item in rgb_config: - item = float(item) / 255 - new_rgb.append(item) - page_data = bbox_list[i] - for j, bbox in enumerate(page_data): - x0, y0, x1, y1 = bbox - rect_coords = fitz.Rect(x0, y0, x1, y1) # Define the rectangle - if draw_bbox: - if fill_config: - page.draw_rect( - rect_coords, - color=None, - fill=new_rgb, - fill_opacity=0.3, - width=0.5, - overlay=True, - ) # Draw the rectangle - else: - page.draw_rect( - rect_coords, - color=new_rgb, - fill=None, - fill_opacity=1, - width=0.5, - overlay=True, - ) # Draw the rectangle - page.insert_text( - (x1 + 2, y0 + 10), str(j + 1), fontsize=10, color=new_rgb - ) # Insert the index in the top left corner of the rectangle - - -def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename): - dropped_bbox_list = [] - tables_list, tables_body_list = [], [] - tables_caption_list, tables_footnote_list = [], [] - imgs_list, imgs_body_list, imgs_caption_list = [], [], [] - imgs_footnote_list = [] - titles_list = [] - texts_list = [] - 
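# Illustrative sketch of a minimal config file as consumed by config_reader.py above.
# Only the key names and the documented fallbacks are taken from the code; every value
# below is a placeholder, not an official or recommended setting.
import json
import os

from magic_pdf.config.constants import MODEL_NAME

sample_config = {
    'bucket_info': {'[default]': ['<access_key>', '<secret_key>', '<endpoint>']},
    'models-dir': '/tmp/models',
    'device-mode': 'cpu',
    'table-config': {'model': MODEL_NAME.RAPID_TABLE, 'enable': False, 'max_time': 400},
    'layout-config': {'model': MODEL_NAME.LAYOUTLMv3},
    'formula-config': {'mfd_model': MODEL_NAME.YOLO_V8_MFD,
                       'mfr_model': MODEL_NAME.UniMerNet_v2_Small, 'enable': True},
}
with open(os.path.expanduser('~/magic-pdf.json'), 'w', encoding='utf-8') as f:
    json.dump(sample_config, f, ensure_ascii=False, indent=2)
# The file name/location can also be overridden via the MINERU_TOOLS_CONFIG_JSON env var.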
interequations_list = [] - lists_list = [] - indexs_list = [] - for page in pdf_info: - - page_dropped_list = [] - tables, tables_body, tables_caption, tables_footnote = [], [], [], [] - imgs, imgs_body, imgs_caption, imgs_footnote = [], [], [], [] - titles = [] - texts = [] - interequations = [] - lists = [] - indices = [] - - for dropped_bbox in page['discarded_blocks']: - page_dropped_list.append(dropped_bbox['bbox']) - dropped_bbox_list.append(page_dropped_list) - for block in page['para_blocks']: - bbox = block['bbox'] - if block['type'] == BlockType.Table: - tables.append(bbox) - for nested_block in block['blocks']: - bbox = nested_block['bbox'] - if nested_block['type'] == BlockType.TableBody: - tables_body.append(bbox) - elif nested_block['type'] == BlockType.TableCaption: - tables_caption.append(bbox) - elif nested_block['type'] == BlockType.TableFootnote: - tables_footnote.append(bbox) - elif block['type'] == BlockType.Image: - imgs.append(bbox) - for nested_block in block['blocks']: - bbox = nested_block['bbox'] - if nested_block['type'] == BlockType.ImageBody: - imgs_body.append(bbox) - elif nested_block['type'] == BlockType.ImageCaption: - imgs_caption.append(bbox) - elif nested_block['type'] == BlockType.ImageFootnote: - imgs_footnote.append(bbox) - elif block['type'] == BlockType.Title: - titles.append(bbox) - elif block['type'] == BlockType.Text: - texts.append(bbox) - elif block['type'] == BlockType.InterlineEquation: - interequations.append(bbox) - elif block['type'] == BlockType.List: - lists.append(bbox) - elif block['type'] == BlockType.Index: - indices.append(bbox) - - tables_list.append(tables) - tables_body_list.append(tables_body) - tables_caption_list.append(tables_caption) - tables_footnote_list.append(tables_footnote) - imgs_list.append(imgs) - imgs_body_list.append(imgs_body) - imgs_caption_list.append(imgs_caption) - imgs_footnote_list.append(imgs_footnote) - titles_list.append(titles) - texts_list.append(texts) - interequations_list.append(interequations) - lists_list.append(lists) - indexs_list.append(indices) - - layout_bbox_list = [] - - table_type_order = { - 'table_caption': 1, - 'table_body': 2, - 'table_footnote': 3 - } - for page in pdf_info: - page_block_list = [] - for block in page['para_blocks']: - if block['type'] in [ - BlockType.Text, - BlockType.Title, - BlockType.InterlineEquation, - BlockType.List, - BlockType.Index, - ]: - bbox = block['bbox'] - page_block_list.append(bbox) - elif block['type'] in [BlockType.Image]: - for sub_block in block['blocks']: - bbox = sub_block['bbox'] - page_block_list.append(bbox) - elif block['type'] in [BlockType.Table]: - sorted_blocks = sorted(block['blocks'], key=lambda x: table_type_order[x['type']]) - for sub_block in sorted_blocks: - bbox = sub_block['bbox'] - page_block_list.append(bbox) - - layout_bbox_list.append(page_block_list) - - pdf_docs = fitz.open('pdf', pdf_bytes) - - for i, page in enumerate(pdf_docs): - - draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158], True) - # draw_bbox_without_number(i, tables_list, page, [153, 153, 0], True) # color ! 
- draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0], True) - draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102], True) - draw_bbox_without_number(i, tables_footnote_list, page, [229, 255, 204], True) - # draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True) - draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True) - draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255], True) - draw_bbox_without_number(i, imgs_footnote_list, page, [255, 178, 102], True), - draw_bbox_without_number(i, titles_list, page, [102, 102, 255], True) - draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True) - draw_bbox_without_number(i, interequations_list, page, [0, 255, 0], True) - draw_bbox_without_number(i, lists_list, page, [40, 169, 92], True) - draw_bbox_without_number(i, indexs_list, page, [40, 169, 92], True) - - draw_bbox_with_number( - i, layout_bbox_list, page, [255, 0, 0], False, draw_bbox=False - ) - - # Save the PDF - pdf_docs.save(f'{out_path}/{filename}') - - -def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename): - text_list = [] - inline_equation_list = [] - interline_equation_list = [] - image_list = [] - table_list = [] - dropped_list = [] - next_page_text_list = [] - next_page_inline_equation_list = [] - - def get_span_info(span): - if span['type'] == ContentType.Text: - if span.get(CROSS_PAGE, False): - next_page_text_list.append(span['bbox']) - else: - page_text_list.append(span['bbox']) - elif span['type'] == ContentType.InlineEquation: - if span.get(CROSS_PAGE, False): - next_page_inline_equation_list.append(span['bbox']) - else: - page_inline_equation_list.append(span['bbox']) - elif span['type'] == ContentType.InterlineEquation: - page_interline_equation_list.append(span['bbox']) - elif span['type'] == ContentType.Image: - page_image_list.append(span['bbox']) - elif span['type'] == ContentType.Table: - page_table_list.append(span['bbox']) - - for page in pdf_info: - page_text_list = [] - page_inline_equation_list = [] - page_interline_equation_list = [] - page_image_list = [] - page_table_list = [] - page_dropped_list = [] - - # 将跨页的span放到移动到下一页的列表中 - if len(next_page_text_list) > 0: - page_text_list.extend(next_page_text_list) - next_page_text_list.clear() - if len(next_page_inline_equation_list) > 0: - page_inline_equation_list.extend(next_page_inline_equation_list) - next_page_inline_equation_list.clear() - - # 构造dropped_list - for block in page['discarded_blocks']: - if block['type'] == BlockType.Discarded: - for line in block['lines']: - for span in line['spans']: - page_dropped_list.append(span['bbox']) - dropped_list.append(page_dropped_list) - # 构造其余useful_list - # for block in page['para_blocks']: # span直接用分段合并前的结果就可以 - for block in page['preproc_blocks']: - if block['type'] in [ - BlockType.Text, - BlockType.Title, - BlockType.InterlineEquation, - BlockType.List, - BlockType.Index, - ]: - for line in block['lines']: - for span in line['spans']: - get_span_info(span) - elif block['type'] in [BlockType.Image, BlockType.Table]: - for sub_block in block['blocks']: - for line in sub_block['lines']: - for span in line['spans']: - get_span_info(span) - text_list.append(page_text_list) - inline_equation_list.append(page_inline_equation_list) - interline_equation_list.append(page_interline_equation_list) - image_list.append(page_image_list) - table_list.append(page_table_list) - pdf_docs = fitz.open('pdf', pdf_bytes) - for i, page in enumerate(pdf_docs): - # 获取当前页面的数据 - 
draw_bbox_without_number(i, text_list, page, [255, 0, 0], False) - draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0], False) - draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255], False) - draw_bbox_without_number(i, image_list, page, [255, 204, 0], False) - draw_bbox_without_number(i, table_list, page, [204, 0, 255], False) - draw_bbox_without_number(i, dropped_list, page, [158, 158, 158], False) - - # Save the PDF - pdf_docs.save(f'{out_path}/{filename}') - - -def draw_model_bbox(model_list, dataset: Dataset, out_path, filename): - dropped_bbox_list = [] - tables_body_list, tables_caption_list, tables_footnote_list = [], [], [] - imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], [] - titles_list = [] - texts_list = [] - interequations_list = [] - magic_model = MagicModel(model_list, dataset) - for i in range(len(model_list)): - page_dropped_list = [] - tables_body, tables_caption, tables_footnote = [], [], [] - imgs_body, imgs_caption, imgs_footnote = [], [], [] - titles = [] - texts = [] - interequations = [] - page_info = magic_model.get_model_list(i) - layout_dets = page_info['layout_dets'] - for layout_det in layout_dets: - bbox = layout_det['bbox'] - if layout_det['category_id'] == CategoryId.Text: - texts.append(bbox) - elif layout_det['category_id'] == CategoryId.Title: - titles.append(bbox) - elif layout_det['category_id'] == CategoryId.TableBody: - tables_body.append(bbox) - elif layout_det['category_id'] == CategoryId.TableCaption: - tables_caption.append(bbox) - elif layout_det['category_id'] == CategoryId.TableFootnote: - tables_footnote.append(bbox) - elif layout_det['category_id'] == CategoryId.ImageBody: - imgs_body.append(bbox) - elif layout_det['category_id'] == CategoryId.ImageCaption: - imgs_caption.append(bbox) - elif layout_det['category_id'] == CategoryId.InterlineEquation_YOLO: - interequations.append(bbox) - elif layout_det['category_id'] == CategoryId.Abandon: - page_dropped_list.append(bbox) - elif layout_det['category_id'] == CategoryId.ImageFootnote: - imgs_footnote.append(bbox) - - tables_body_list.append(tables_body) - tables_caption_list.append(tables_caption) - tables_footnote_list.append(tables_footnote) - imgs_body_list.append(imgs_body) - imgs_caption_list.append(imgs_caption) - titles_list.append(titles) - texts_list.append(texts) - interequations_list.append(interequations) - dropped_bbox_list.append(page_dropped_list) - imgs_footnote_list.append(imgs_footnote) - - for i in range(len(dataset)): - page = dataset.get_page(i) - draw_bbox_with_number( - i, dropped_bbox_list, page, [158, 158, 158], True - ) # color ! 
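# Usage sketch (hypothetical file names) for the two debug renderers defined above.
# pdf_info is the 'pdf_info' list from a *_middle.json produced by the pipeline.
import json
with open('/tmp/output/doc_middle.json', encoding='utf-8') as f:
    pdf_info = json.load(f)['pdf_info']
with open('/tmp/output/doc.pdf', 'rb') as f:
    pdf_bytes = f.read()
draw_layout_bbox(pdf_info, pdf_bytes, '/tmp/output', 'doc_layout.pdf')
draw_span_bbox(pdf_info, pdf_bytes, '/tmp/output', 'doc_spans.pdf')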
- draw_bbox_with_number(i, tables_body_list, page, [204, 204, 0], True) - draw_bbox_with_number(i, tables_caption_list, page, [255, 255, 102], True) - draw_bbox_with_number(i, tables_footnote_list, page, [229, 255, 204], True) - draw_bbox_with_number(i, imgs_body_list, page, [153, 255, 51], True) - draw_bbox_with_number(i, imgs_caption_list, page, [102, 178, 255], True) - draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102], True) - draw_bbox_with_number(i, titles_list, page, [102, 102, 255], True) - draw_bbox_with_number(i, texts_list, page, [153, 0, 76], True) - draw_bbox_with_number(i, interequations_list, page, [0, 255, 0], True) - - # Save the PDF - dataset.dump_to_file(f'{out_path}/{filename}') - - -def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename): - layout_bbox_list = [] - - for page in pdf_info: - page_line_list = [] - for block in page['preproc_blocks']: - if block['type'] in [BlockType.Text]: - for line in block['lines']: - bbox = line['bbox'] - index = line['index'] - page_line_list.append({'index': index, 'bbox': bbox}) - elif block['type'] in [BlockType.Title, BlockType.InterlineEquation]: - if 'virtual_lines' in block: - if len(block['virtual_lines']) > 0 and block['virtual_lines'][0].get('index', None) is not None: - for line in block['virtual_lines']: - bbox = line['bbox'] - index = line['index'] - page_line_list.append({'index': index, 'bbox': bbox}) - else: - for line in block['lines']: - bbox = line['bbox'] - index = line['index'] - page_line_list.append({'index': index, 'bbox': bbox}) - elif block['type'] in [BlockType.Image, BlockType.Table]: - for sub_block in block['blocks']: - if sub_block['type'] in [BlockType.ImageBody, BlockType.TableBody]: - if len(sub_block['virtual_lines']) > 0 and sub_block['virtual_lines'][0].get('index', None) is not None: - for line in sub_block['virtual_lines']: - bbox = line['bbox'] - index = line['index'] - page_line_list.append({'index': index, 'bbox': bbox}) - else: - for line in sub_block['lines']: - bbox = line['bbox'] - index = line['index'] - page_line_list.append({'index': index, 'bbox': bbox}) - elif sub_block['type'] in [BlockType.ImageCaption, BlockType.TableCaption, BlockType.ImageFootnote, BlockType.TableFootnote]: - for line in sub_block['lines']: - bbox = line['bbox'] - index = line['index'] - page_line_list.append({'index': index, 'bbox': bbox}) - sorted_bboxes = sorted(page_line_list, key=lambda x: x['index']) - layout_bbox_list.append(sorted_bbox['bbox'] for sorted_bbox in sorted_bboxes) - pdf_docs = fitz.open('pdf', pdf_bytes) - for i, page in enumerate(pdf_docs): - draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False) - - pdf_docs.save(f'{out_path}/{filename}') - - -def draw_char_bbox(pdf_bytes, out_path, filename): - pdf_docs = fitz.open('pdf', pdf_bytes) - for i, page in enumerate(pdf_docs): - for block in page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']: - for line in block['lines']: - for span in line['spans']: - for char in span['chars']: - char_bbox = char['bbox'] - page.draw_rect(char_bbox, color=[1, 0, 0], fill=None, fill_opacity=1, width=0.3, overlay=True,) - pdf_docs.save(f'{out_path}/{filename}') diff --git a/magic_pdf/libs/hash_utils.py b/magic_pdf/libs/hash_utils.py deleted file mode 100644 index 47b8aea746eb04eeb427b775227692ef6b4d9d29..0000000000000000000000000000000000000000 --- a/magic_pdf/libs/hash_utils.py +++ /dev/null @@ -1,15 +0,0 @@ -import hashlib - - -def 
compute_md5(file_bytes): - hasher = hashlib.md5() - hasher.update(file_bytes) - return hasher.hexdigest().upper() - - -def compute_sha256(input_string): - hasher = hashlib.sha256() - # 在Python3中,需要将字符串转化为字节对象才能被哈希函数处理 - input_bytes = input_string.encode('utf-8') - hasher.update(input_bytes) - return hasher.hexdigest() diff --git a/magic_pdf/libs/json_compressor.py b/magic_pdf/libs/json_compressor.py deleted file mode 100644 index 77ef1c876fcae0b34a42355b3edb079bb5dd891b..0000000000000000000000000000000000000000 --- a/magic_pdf/libs/json_compressor.py +++ /dev/null @@ -1,27 +0,0 @@ -import json -import brotli -import base64 - -class JsonCompressor: - - @staticmethod - def compress_json(data): - """ - Compress a json object and encode it with base64 - """ - json_str = json.dumps(data) - json_bytes = json_str.encode('utf-8') - compressed = brotli.compress(json_bytes, quality=6) - compressed_str = base64.b64encode(compressed).decode('utf-8') # convert bytes to string - return compressed_str - - @staticmethod - def decompress_json(compressed_str): - """ - Decode the base64 string and decompress the json object - """ - compressed = base64.b64decode(compressed_str.encode('utf-8')) # convert string to bytes - decompressed_bytes = brotli.decompress(compressed) - json_str = decompressed_bytes.decode('utf-8') - data = json.loads(json_str) - return data diff --git a/magic_pdf/libs/language.py b/magic_pdf/libs/language.py deleted file mode 100644 index 73d382b7c436f8c0a8a7498e4ea1584b0719e8a5..0000000000000000000000000000000000000000 --- a/magic_pdf/libs/language.py +++ /dev/null @@ -1,48 +0,0 @@ -import os -import unicodedata - -if not os.getenv("FTLANG_CACHE"): - current_file_path = os.path.abspath(__file__) - current_dir = os.path.dirname(current_file_path) - root_dir = os.path.dirname(current_dir) - ftlang_cache_dir = os.path.join(root_dir, 'resources', 'fasttext-langdetect') - os.environ["FTLANG_CACHE"] = str(ftlang_cache_dir) - # print(os.getenv("FTLANG_CACHE")) - -from fast_langdetect import detect_language - - -def remove_invalid_surrogates(text): - # 移除无效的 UTF-16 代理对 - return ''.join(c for c in text if not (0xD800 <= ord(c) <= 0xDFFF)) - - -def detect_lang(text: str) -> str: - - if len(text) == 0: - return "" - - text = text.replace("\n", "") - text = remove_invalid_surrogates(text) - - # print(text) - try: - lang_upper = detect_language(text) - except: - html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]]) - lang_upper = detect_language(html_no_ctrl_chars) - - try: - lang = lang_upper.lower() - except: - lang = "" - return lang - - -if __name__ == '__main__': - print(os.getenv("FTLANG_CACHE")) - print(detect_lang("This is a test.")) - print(detect_lang("This is a test")) - print(detect_lang("这个是中文测试。")) - print(detect_lang("这个是中文测试。")) - print(detect_lang("〖\ud835\udc46\ud835〗这是个包含utf-16的中文测试")) \ No newline at end of file diff --git a/magic_pdf/libs/local_math.py b/magic_pdf/libs/local_math.py deleted file mode 100644 index 9edbcc7074dfa189a8508eb76366ae31dba4d665..0000000000000000000000000000000000000000 --- a/magic_pdf/libs/local_math.py +++ /dev/null @@ -1,9 +0,0 @@ -def float_gt(a, b): - if 0.0001 >= abs(a -b): - return False - return a > b - -def float_equal(a, b): - if 0.0001 >= abs(a-b): - return True - return False \ No newline at end of file diff --git a/magic_pdf/libs/markdown_utils.py b/magic_pdf/libs/markdown_utils.py deleted file mode 100644 index 036232c880b584573a4cd031fed4f457d8d63e6f..0000000000000000000000000000000000000000 --- 
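# Usage sketch for the hashing, JSON-compression and language helpers above:
print(compute_md5(b'hello'))                       # uppercase MD5 hexdigest of the bytes
print(compute_sha256('2_10_20_30_40'))             # lowercase SHA-256 hexdigest of the string
packed = JsonCompressor.compress_json({'pdf_info': [], 'version': 1})
restored = JsonCompressor.decompress_json(packed)  # round-trips via brotli + base64
assert restored == {'pdf_info': [], 'version': 1}
print(detect_lang('This is a short English sentence.'))   # a lowercased code, e.g. 'en'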
a/magic_pdf/libs/markdown_utils.py +++ /dev/null @@ -1,10 +0,0 @@ - -def ocr_escape_special_markdown_char(content): - """ - 转义正文里对markdown语法有特殊意义的字符 - """ - special_chars = ["*", "`", "~", "$"] - for char in special_chars: - content = content.replace(char, "\\" + char) - - return content diff --git a/magic_pdf/libs/path_utils.py b/magic_pdf/libs/path_utils.py deleted file mode 100644 index 15fff01b5a698fbd6b1df11d9608b9ef12ffc715..0000000000000000000000000000000000000000 --- a/magic_pdf/libs/path_utils.py +++ /dev/null @@ -1,32 +0,0 @@ - - -def remove_non_official_s3_args(s3path): - """ - example: s3://abc/xxxx.json?bytes=0,81350 ==> s3://abc/xxxx.json - """ - arr = s3path.split("?") - return arr[0] - -def parse_s3path(s3path: str): - # from s3pathlib import S3Path - # p = S3Path(remove_non_official_s3_args(s3path)) - # return p.bucket, p.key - s3path = remove_non_official_s3_args(s3path).strip() - if s3path.startswith(('s3://', 's3a://')): - prefix, path = s3path.split('://', 1) - bucket_name, key = path.split('/', 1) - return bucket_name, key - elif s3path.startswith('/'): - raise ValueError("The provided path starts with '/'. This does not conform to a valid S3 path format.") - else: - raise ValueError("Invalid S3 path format. Expected 's3://bucket-name/key' or 's3a://bucket-name/key'.") - - -def parse_s3_range_params(s3path: str): - """ - example: s3://abc/xxxx.json?bytes=0,81350 ==> [0, 81350] - """ - arr = s3path.split("?bytes=") - if len(arr) == 1: - return None - return arr[1].split(",") diff --git a/magic_pdf/libs/pdf_check.py b/magic_pdf/libs/pdf_check.py deleted file mode 100644 index 98402b383b74800817a0770cb495e280a52b5e6c..0000000000000000000000000000000000000000 --- a/magic_pdf/libs/pdf_check.py +++ /dev/null @@ -1,99 +0,0 @@ -import fitz -import numpy as np -from loguru import logger -import re -from io import BytesIO -from pdfminer.high_level import extract_text -from pdfminer.layout import LAParams - - -def calculate_sample_count(total_page: int): - """ - 根据总页数和采样率计算采样页面的数量。 - """ - select_page_cnt = min(10, total_page) - return select_page_cnt - - -def extract_pages(src_pdf_bytes: bytes) -> fitz.Document: - pdf_docs = fitz.open("pdf", src_pdf_bytes) - total_page = len(pdf_docs) - if total_page == 0: - # 如果PDF没有页面,直接返回空文档 - logger.warning("PDF is empty, return empty document") - return fitz.Document() - select_page_cnt = calculate_sample_count(total_page) - - page_num = np.random.choice(total_page, select_page_cnt, replace=False) - sample_docs = fitz.Document() - try: - for index in page_num: - sample_docs.insert_pdf(pdf_docs, from_page=int(index), to_page=int(index)) - except Exception as e: - logger.exception(e) - return sample_docs - - -def detect_invalid_chars(src_pdf_bytes: bytes) -> bool: - """" - 检测PDF中是否包含非法字符 - """ - '''pdfminer比较慢,需要先随机抽取10页左右的sample''' - sample_docs = extract_pages(src_pdf_bytes) - sample_pdf_bytes = sample_docs.tobytes() - sample_pdf_file_like_object = BytesIO(sample_pdf_bytes) - laparams = LAParams( - line_overlap=0.5, - char_margin=2.0, - line_margin=0.5, - word_margin=0.1, - boxes_flow=None, - detect_vertical=False, - all_texts=False, - ) - text = extract_text(pdf_file=sample_pdf_file_like_object, laparams=laparams) - text = text.replace("\n", "") - # logger.info(text) - '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)''' - cid_pattern = re.compile(r'\(cid:\d+\)') - matches = cid_pattern.findall(text) - cid_count = len(matches) - cid_len = sum(len(match) for match in matches) - text_len = len(text) - if text_len == 0: - cid_chars_radio = 0 - else: - 
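# Usage sketch for the S3 path helpers above (the example path comes from their docstrings):
print(remove_non_official_s3_args('s3://abc/xxxx.json?bytes=0,81350'))  # 's3://abc/xxxx.json'
print(parse_s3path('s3://abc/xxxx.json?bytes=0,81350'))                 # ('abc', 'xxxx.json')
print(parse_s3_range_params('s3://abc/xxxx.json?bytes=0,81350'))        # ['0', '81350'] (strings)
print(parse_s3_range_params('s3://abc/xxxx.json'))                      # None: no byte range present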
cid_chars_radio = cid_count/(cid_count + text_len - cid_len) - logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}") - '''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档''' - if cid_chars_radio > 0.05: - return False # 乱码文档 - else: - return True # 正常文档 - - -def count_replacement_characters(text: str) -> int: - """ - 统计字符串中 0xfffd 字符的数量。 - """ - return text.count('\ufffd') - - -def detect_invalid_chars_by_pymupdf(src_pdf_bytes: bytes) -> bool: - sample_docs = extract_pages(src_pdf_bytes) - doc_text = "" - for page in sample_docs: - page_text = page.get_text('text', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP) - doc_text += page_text - text_len = len(doc_text) - uffd_count = count_replacement_characters(doc_text) - if text_len == 0: - uffd_chars_radio = 0 - else: - uffd_chars_radio = uffd_count / text_len - logger.info(f"uffd_count: {uffd_count}, text_len: {text_len}, uffd_chars_radio: {uffd_chars_radio}") - '''当一篇文章存在1%以上的文本是乱码时,认为该文档为乱码文档''' - if uffd_chars_radio > 0.01: - return False # 乱码文档 - else: - return True # 正常文档 \ No newline at end of file diff --git a/magic_pdf/libs/pdf_image_tools.py b/magic_pdf/libs/pdf_image_tools.py deleted file mode 100644 index 80201167da768f8f182c1d0eb2ae10771d96caa9..0000000000000000000000000000000000000000 --- a/magic_pdf/libs/pdf_image_tools.py +++ /dev/null @@ -1,63 +0,0 @@ -from io import BytesIO -import cv2 -import fitz -import numpy as np -from PIL import Image -from magic_pdf.data.data_reader_writer import DataWriter -from magic_pdf.libs.commons import join_path -from magic_pdf.libs.hash_utils import compute_sha256 - - -def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter: DataWriter): - """从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径 save_path:需要同时支持s3和本地, - 图片存放在save_path下,文件名是: - {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。""" - # 拼接文件名 - filename = f'{page_num}_{int(bbox[0])}_{int(bbox[1])}_{int(bbox[2])}_{int(bbox[3])}' - - # 老版本返回不带bucket的路径 - img_path = join_path(return_path, filename) if return_path is not None else None - - # 新版本生成平铺路径 - img_hash256_path = f'{compute_sha256(img_path)}.jpg' - - # 将坐标转换为fitz.Rect对象 - rect = fitz.Rect(*bbox) - # 配置缩放倍数为3倍 - zoom = fitz.Matrix(3, 3) - # 截取图片 - pix = page.get_pixmap(clip=rect, matrix=zoom) - - byte_data = pix.tobytes(output='jpeg', jpg_quality=95) - - imageWriter.write(img_hash256_path, byte_data) - - return img_hash256_path - - -def cut_image_to_pil_image(bbox: tuple, page: fitz.Page, mode="pillow"): - - # 将坐标转换为fitz.Rect对象 - rect = fitz.Rect(*bbox) - # 配置缩放倍数为3倍 - zoom = fitz.Matrix(3, 3) - # 截取图片 - pix = page.get_pixmap(clip=rect, matrix=zoom) - - if mode == "cv2": - # 直接转换为numpy数组供cv2使用 - img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n) - # PyMuPDF使用RGB顺序,而cv2使用BGR顺序 - if pix.n == 3 or pix.n == 4: - image_result = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR) - else: - image_result = img_array - elif mode == "pillow": - # 将字节数据转换为文件对象 - image_file = BytesIO(pix.tobytes(output='png')) - # 使用 Pillow 打开图像 - image_result = Image.open(image_file) - else: - raise ValueError(f"mode: {mode} is not supported.") - - return image_result \ No newline at end of file diff --git a/magic_pdf/libs/performance_stats.py b/magic_pdf/libs/performance_stats.py deleted file mode 100644 index 3aeaeb33cb6832c35fea5520a78cf31626c4270c..0000000000000000000000000000000000000000 --- a/magic_pdf/libs/performance_stats.py +++ /dev/null @@ -1,65 +0,0 @@ -import time -import functools 
-from collections import defaultdict -from typing import Dict, List - - -class PerformanceStats: - """性能统计类,用于收集和展示方法执行时间""" - - _stats: Dict[str, List[float]] = defaultdict(list) - - @classmethod - def add_execution_time(cls, func_name: str, execution_time: float): - """添加执行时间记录""" - cls._stats[func_name].append(execution_time) - - @classmethod - def get_stats(cls) -> Dict[str, dict]: - """获取统计结果""" - results = {} - for func_name, times in cls._stats.items(): - results[func_name] = { - 'count': len(times), - 'total_time': sum(times), - 'avg_time': sum(times) / len(times), - 'min_time': min(times), - 'max_time': max(times) - } - return results - - @classmethod - def print_stats(cls): - """打印统计结果""" - stats = cls.get_stats() - print("\n性能统计结果:") - print("-" * 80) - print(f"{'方法名':<40} {'调用次数':>8} {'总时间(s)':>12} {'平均时间(s)':>12}") - print("-" * 80) - for func_name, data in stats.items(): - print(f"{func_name:<40} {data['count']:8d} {data['total_time']:12.6f} {data['avg_time']:12.6f}") - - -def measure_time(func): - """测量方法执行时间的装饰器""" - - @functools.wraps(func) - def wrapper(*args, **kwargs): - start_time = time.time() - result = func(*args, **kwargs) - execution_time = time.time() - start_time - - # 获取更详细的函数标识 - if hasattr(func, "__self__"): # 实例方法 - class_name = func.__self__.__class__.__name__ - full_name = f"{class_name}.{func.__name__}" - elif hasattr(func, "__qualname__"): # 类方法或静态方法 - full_name = func.__qualname__ - else: - module_name = func.__module__ - full_name = f"{module_name}.{func.__name__}" - - PerformanceStats.add_execution_time(full_name, execution_time) - return result - - return wrapper \ No newline at end of file diff --git a/magic_pdf/libs/safe_filename.py b/magic_pdf/libs/safe_filename.py deleted file mode 100644 index 1076a4bae218e180351ef2ec4692f156e03be1c7..0000000000000000000000000000000000000000 --- a/magic_pdf/libs/safe_filename.py +++ /dev/null @@ -1,11 +0,0 @@ -import os - - -def sanitize_filename(filename, replacement="_"): - if os.name == 'nt': - invalid_chars = '<>:"|?*' - - for char in invalid_chars: - filename = filename.replace(char, replacement) - - return filename diff --git a/magic_pdf/libs/version.py b/magic_pdf/libs/version.py deleted file mode 100644 index c45d9dbf3a2fb0a83065d719614b463df244d2b3..0000000000000000000000000000000000000000 --- a/magic_pdf/libs/version.py +++ /dev/null @@ -1 +0,0 @@ -__version__ = "1.3.12" diff --git a/magic_pdf/model/__init__.py b/magic_pdf/model/__init__.py deleted file mode 100644 index 859d01b33457ba56047073fdfefb9ef718cfa236..0000000000000000000000000000000000000000 --- a/magic_pdf/model/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -__use_inside_model__ = True -__model_mode__ = 'full' \ No newline at end of file diff --git a/magic_pdf/model/batch_analyze.py b/magic_pdf/model/batch_analyze.py deleted file mode 100644 index be5e331fd801433fea2f41de317c9e1424649b00..0000000000000000000000000000000000000000 --- a/magic_pdf/model/batch_analyze.py +++ /dev/null @@ -1,265 +0,0 @@ -import time -import cv2 -from loguru import logger -from tqdm import tqdm - -from magic_pdf.config.constants import MODEL_NAME -from magic_pdf.model.sub_modules.model_init import AtomModelSingleton -from magic_pdf.model.sub_modules.model_utils import ( - clean_vram, crop_img, get_res_list_from_layout_res, get_coords_and_area) -from magic_pdf.model.sub_modules.ocr.paddleocr2pytorch.ocr_utils import ( - get_adjusted_mfdetrec_res, get_ocr_result_list) - -YOLO_LAYOUT_BASE_BATCH_SIZE = 1 -MFD_BASE_BATCH_SIZE = 1 -MFR_BASE_BATCH_SIZE = 16 - - -class 
BatchAnalyze: - def __init__(self, model_manager, batch_ratio: int, show_log, layout_model, formula_enable, table_enable): - self.model_manager = model_manager - self.batch_ratio = batch_ratio - self.show_log = show_log - self.layout_model = layout_model - self.formula_enable = formula_enable - self.table_enable = table_enable - - def __call__(self, images_with_extra_info: list) -> list: - if len(images_with_extra_info) == 0: - return [] - - images_layout_res = [] - layout_start_time = time.time() - self.model = self.model_manager.get_model( - ocr=True, - show_log=self.show_log, - lang = None, - layout_model = self.layout_model, - formula_enable = self.formula_enable, - table_enable = self.table_enable, - ) - - images = [image for image, _, _ in images_with_extra_info] - - if self.model.layout_model_name == MODEL_NAME.LAYOUTLMv3: - # layoutlmv3 - for image in images: - layout_res = self.model.layout_model(image, ignore_catids=[]) - images_layout_res.append(layout_res) - elif self.model.layout_model_name == MODEL_NAME.DocLayout_YOLO: - # doclayout_yolo - layout_images = [] - for image_index, image in enumerate(images): - layout_images.append(image) - - images_layout_res += self.model.layout_model.batch_predict( - # layout_images, self.batch_ratio * YOLO_LAYOUT_BASE_BATCH_SIZE - layout_images, YOLO_LAYOUT_BASE_BATCH_SIZE - ) - - # logger.info( - # f'layout time: {round(time.time() - layout_start_time, 2)}, image num: {len(images)}' - # ) - - if self.model.apply_formula: - # 公式检测 - mfd_start_time = time.time() - images_mfd_res = self.model.mfd_model.batch_predict( - # images, self.batch_ratio * MFD_BASE_BATCH_SIZE - images, MFD_BASE_BATCH_SIZE - ) - # logger.info( - # f'mfd time: {round(time.time() - mfd_start_time, 2)}, image num: {len(images)}' - # ) - - # 公式识别 - mfr_start_time = time.time() - images_formula_list = self.model.mfr_model.batch_predict( - images_mfd_res, - images, - batch_size=self.batch_ratio * MFR_BASE_BATCH_SIZE, - ) - mfr_count = 0 - for image_index in range(len(images)): - images_layout_res[image_index] += images_formula_list[image_index] - mfr_count += len(images_formula_list[image_index]) - # logger.info( - # f'mfr time: {round(time.time() - mfr_start_time, 2)}, image num: {mfr_count}' - # ) - - # 清理显存 - # clean_vram(self.model.device, vram_threshold=8) - - ocr_res_list_all_page = [] - table_res_list_all_page = [] - for index in range(len(images)): - _, ocr_enable, _lang = images_with_extra_info[index] - layout_res = images_layout_res[index] - np_array_img = images[index] - - ocr_res_list, table_res_list, single_page_mfdetrec_res = ( - get_res_list_from_layout_res(layout_res) - ) - - ocr_res_list_all_page.append({'ocr_res_list':ocr_res_list, - 'lang':_lang, - 'ocr_enable':ocr_enable, - 'np_array_img':np_array_img, - 'single_page_mfdetrec_res':single_page_mfdetrec_res, - 'layout_res':layout_res, - }) - - for table_res in table_res_list: - table_img, _ = crop_img(table_res, np_array_img) - table_res_list_all_page.append({'table_res':table_res, - 'lang':_lang, - 'table_img':table_img, - }) - - # 文本框检测 - det_start = time.time() - det_count = 0 - # for ocr_res_list_dict in ocr_res_list_all_page: - for ocr_res_list_dict in tqdm(ocr_res_list_all_page, desc="OCR-det Predict"): - # Process each area that requires OCR processing - _lang = ocr_res_list_dict['lang'] - # Get OCR results for this language's images - atom_model_manager = AtomModelSingleton() - ocr_model = atom_model_manager.get_atom_model( - atom_model_name='ocr', - ocr_show_log=False, - det_db_box_thresh=0.3, - 
lang=_lang - ) - for res in ocr_res_list_dict['ocr_res_list']: - new_image, useful_list = crop_img( - res, ocr_res_list_dict['np_array_img'], crop_paste_x=50, crop_paste_y=50 - ) - adjusted_mfdetrec_res = get_adjusted_mfdetrec_res( - ocr_res_list_dict['single_page_mfdetrec_res'], useful_list - ) - - # OCR-det - new_image = cv2.cvtColor(new_image, cv2.COLOR_RGB2BGR) - ocr_res = ocr_model.ocr( - new_image, mfd_res=adjusted_mfdetrec_res, rec=False - )[0] - - # Integration results - if ocr_res: - ocr_result_list = get_ocr_result_list(ocr_res, useful_list, ocr_res_list_dict['ocr_enable'], new_image, _lang) - - if res["category_id"] == 3: - # ocr_result_list中所有bbox的面积之和 - ocr_res_area = sum(get_coords_and_area(ocr_res_item)[4] for ocr_res_item in ocr_result_list if 'poly' in ocr_res_item) - # 求ocr_res_area和res的面积的比值 - res_area = get_coords_and_area(res)[4] - if res_area > 0: - ratio = ocr_res_area / res_area - if ratio > 0.25: - res["category_id"] = 1 - else: - continue - - ocr_res_list_dict['layout_res'].extend(ocr_result_list) - - # det_count += len(ocr_res_list_dict['ocr_res_list']) - # logger.info(f'ocr-det time: {round(time.time()-det_start, 2)}, image num: {det_count}') - - - # 表格识别 table recognition - if self.model.apply_table: - table_start = time.time() - # for table_res_list_dict in table_res_list_all_page: - for table_res_dict in tqdm(table_res_list_all_page, desc="Table Predict"): - _lang = table_res_dict['lang'] - atom_model_manager = AtomModelSingleton() - table_model = atom_model_manager.get_atom_model( - atom_model_name='table', - table_model_name='rapid_table', - table_model_path='', - table_max_time=400, - device='cpu', - lang=_lang, - table_sub_model_name='slanet_plus' - ) - html_code, table_cell_bboxes, logic_points, elapse = table_model.predict(table_res_dict['table_img']) - # 判断是否返回正常 - if html_code: - expected_ending = html_code.strip().endswith( - '' - ) or html_code.strip().endswith('') - if expected_ending: - table_res_dict['table_res']['html'] = html_code - else: - logger.warning( - 'table recognition processing fails, not found expected HTML table end' - ) - else: - logger.warning( - 'table recognition processing fails, not get html return' - ) - # logger.info(f'table time: {round(time.time() - table_start, 2)}, image num: {len(table_res_list_all_page)}') - - # Create dictionaries to store items by language - need_ocr_lists_by_lang = {} # Dict of lists for each language - img_crop_lists_by_lang = {} # Dict of lists for each language - - for layout_res in images_layout_res: - for layout_res_item in layout_res: - if layout_res_item['category_id'] in [15]: - if 'np_img' in layout_res_item and 'lang' in layout_res_item: - lang = layout_res_item['lang'] - - # Initialize lists for this language if not exist - if lang not in need_ocr_lists_by_lang: - need_ocr_lists_by_lang[lang] = [] - img_crop_lists_by_lang[lang] = [] - - # Add to the appropriate language-specific lists - need_ocr_lists_by_lang[lang].append(layout_res_item) - img_crop_lists_by_lang[lang].append(layout_res_item['np_img']) - - # Remove the fields after adding to lists - layout_res_item.pop('np_img') - layout_res_item.pop('lang') - - - if len(img_crop_lists_by_lang) > 0: - - # Process OCR by language - rec_time = 0 - rec_start = time.time() - total_processed = 0 - - # Process each language separately - for lang, img_crop_list in img_crop_lists_by_lang.items(): - if len(img_crop_list) > 0: - # Get OCR results for this language's images - atom_model_manager = AtomModelSingleton() - ocr_model = 
atom_model_manager.get_atom_model( - atom_model_name='ocr', - ocr_show_log=False, - det_db_box_thresh=0.3, - lang=lang - ) - ocr_res_list = ocr_model.ocr(img_crop_list, det=False, tqdm_enable=True)[0] - - # Verify we have matching counts - assert len(ocr_res_list) == len( - need_ocr_lists_by_lang[lang]), f'ocr_res_list: {len(ocr_res_list)}, need_ocr_list: {len(need_ocr_lists_by_lang[lang])} for lang: {lang}' - - # Process OCR results for this language - for index, layout_res_item in enumerate(need_ocr_lists_by_lang[lang]): - ocr_text, ocr_score = ocr_res_list[index] - layout_res_item['text'] = ocr_text - layout_res_item['score'] = float(f"{ocr_score:.3f}") - - total_processed += len(img_crop_list) - - rec_time += time.time() - rec_start - # logger.info(f'ocr-rec time: {round(rec_time, 2)}, total images processed: {total_processed}') - - - - return images_layout_res diff --git a/magic_pdf/model/doc_analyze_by_custom_model.py b/magic_pdf/model/doc_analyze_by_custom_model.py deleted file mode 100644 index 93eecc6892f0986823d0a84693c96145dc8b9fea..0000000000000000000000000000000000000000 --- a/magic_pdf/model/doc_analyze_by_custom_model.py +++ /dev/null @@ -1,301 +0,0 @@ -import os -import time - -import numpy as np -import torch - -os.environ['FLAGS_npu_jit_compile'] = '0' # 关闭paddle的jit编译 -os.environ['FLAGS_use_stride_kernel'] = '0' -os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 让mps可以fallback -os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新 - - -from loguru import logger - -from magic_pdf.model.sub_modules.model_utils import get_vram -from magic_pdf.config.enums import SupportedPdfParseMethod -import magic_pdf.model as model_config -from magic_pdf.data.dataset import Dataset -from magic_pdf.libs.clean_memory import clean_memory -from magic_pdf.libs.config_reader import (get_device, get_formula_config, - get_layout_config, - get_local_models_dir, - get_table_recog_config) -from magic_pdf.model.model_list import MODEL - -class ModelSingleton: - _instance = None - _models = {} - - def __new__(cls, *args, **kwargs): - if cls._instance is None: - cls._instance = super().__new__(cls) - return cls._instance - - def get_model( - self, - ocr: bool, - show_log: bool, - lang=None, - layout_model=None, - formula_enable=None, - table_enable=None, - ): - key = (ocr, show_log, lang, layout_model, formula_enable, table_enable) - if key not in self._models: - self._models[key] = custom_model_init( - ocr=ocr, - show_log=show_log, - lang=lang, - layout_model=layout_model, - formula_enable=formula_enable, - table_enable=table_enable, - ) - return self._models[key] - - -def custom_model_init( - ocr: bool = False, - show_log: bool = False, - lang=None, - layout_model=None, - formula_enable=None, - table_enable=None, -): - model = None - if model_config.__model_mode__ == 'lite': - logger.warning( - 'The Lite mode is provided for developers to conduct testing only, and the output quality is ' - 'not guaranteed to be reliable.' 
- ) - model = MODEL.Paddle - elif model_config.__model_mode__ == 'full': - model = MODEL.PEK - - if model_config.__use_inside_model__: - model_init_start = time.time() - if model == MODEL.Paddle: - from magic_pdf.model.pp_structure_v2 import CustomPaddleModel - - custom_model = CustomPaddleModel(ocr=ocr, show_log=show_log, lang=lang) - elif model == MODEL.PEK: - from magic_pdf.model.pdf_extract_kit import CustomPEKModel - - # 从配置文件读取model-dir和device - local_models_dir = get_local_models_dir() - device = get_device() - - layout_config = get_layout_config() - if layout_model is not None: - layout_config['model'] = layout_model - - formula_config = get_formula_config() - if formula_enable is not None: - formula_config['enable'] = formula_enable - - table_config = get_table_recog_config() - if table_enable is not None: - table_config['enable'] = table_enable - - model_input = { - 'ocr': ocr, - 'show_log': show_log, - 'models_dir': local_models_dir, - 'device': device, - 'table_config': table_config, - 'layout_config': layout_config, - 'formula_config': formula_config, - 'lang': lang, - } - - custom_model = CustomPEKModel(**model_input) - else: - logger.error('Not allow model_name!') - exit(1) - model_init_cost = time.time() - model_init_start - logger.info(f'model init cost: {model_init_cost}') - else: - logger.error('use_inside_model is False, not allow to use inside model') - exit(1) - - return custom_model - -def doc_analyze( - dataset: Dataset, - ocr: bool = False, - show_log: bool = False, - start_page_id=0, - end_page_id=None, - lang=None, - layout_model=None, - formula_enable=None, - table_enable=None, -): - end_page_id = ( - end_page_id - if end_page_id is not None and end_page_id >= 0 - else len(dataset) - 1 - ) - - MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 200)) - images = [] - page_wh_list = [] - for index in range(len(dataset)): - if start_page_id <= index <= end_page_id: - page_data = dataset.get_page(index) - img_dict = page_data.get_image() - images.append(img_dict['img']) - page_wh_list.append((img_dict['width'], img_dict['height'])) - - images_with_extra_info = [(images[index], ocr, dataset._lang) for index in range(len(images))] - - if len(images) >= MIN_BATCH_INFERENCE_SIZE: - batch_size = MIN_BATCH_INFERENCE_SIZE - batch_images = [images_with_extra_info[i:i+batch_size] for i in range(0, len(images_with_extra_info), batch_size)] - else: - batch_images = [images_with_extra_info] - - results = [] - processed_images_count = 0 - for index, batch_image in enumerate(batch_images): - processed_images_count += len(batch_image) - logger.info(f'Batch {index + 1}/{len(batch_images)}: {processed_images_count} pages/{len(images_with_extra_info)} pages') - result = may_batch_image_analyze(batch_image, ocr, show_log,layout_model, formula_enable, table_enable) - results.extend(result) - - model_json = [] - for index in range(len(dataset)): - if start_page_id <= index <= end_page_id: - result = results.pop(0) - page_width, page_height = page_wh_list.pop(0) - else: - result = [] - page_height = 0 - page_width = 0 - - page_info = {'page_no': index, 'width': page_width, 'height': page_height} - page_dict = {'layout_dets': result, 'page_info': page_info} - model_json.append(page_dict) - - from magic_pdf.operators.models import InferenceResult - return InferenceResult(model_json, dataset) - -def batch_doc_analyze( - datasets: list[Dataset], - parse_method: str = 'auto', - show_log: bool = False, - lang=None, - layout_model=None, - formula_enable=None, - 
table_enable=None, -): - MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 100)) - batch_size = MIN_BATCH_INFERENCE_SIZE - page_wh_list = [] - - images_with_extra_info = [] - for dataset in datasets: - - ocr = False - if parse_method == 'auto': - if dataset.classify() == SupportedPdfParseMethod.TXT: - ocr = False - elif dataset.classify() == SupportedPdfParseMethod.OCR: - ocr = True - elif parse_method == 'ocr': - ocr = True - elif parse_method == 'txt': - ocr = False - - _lang = dataset._lang - - for index in range(len(dataset)): - page_data = dataset.get_page(index) - img_dict = page_data.get_image() - page_wh_list.append((img_dict['width'], img_dict['height'])) - images_with_extra_info.append((img_dict['img'], ocr, _lang)) - - batch_images = [images_with_extra_info[i:i+batch_size] for i in range(0, len(images_with_extra_info), batch_size)] - results = [] - processed_images_count = 0 - for index, batch_image in enumerate(batch_images): - processed_images_count += len(batch_image) - logger.info(f'Batch {index + 1}/{len(batch_images)}: {processed_images_count} pages/{len(images_with_extra_info)} pages') - result = may_batch_image_analyze(batch_image, True, show_log, layout_model, formula_enable, table_enable) - results.extend(result) - - infer_results = [] - from magic_pdf.operators.models import InferenceResult - for index in range(len(datasets)): - dataset = datasets[index] - model_json = [] - for i in range(len(dataset)): - result = results.pop(0) - page_width, page_height = page_wh_list.pop(0) - page_info = {'page_no': i, 'width': page_width, 'height': page_height} - page_dict = {'layout_dets': result, 'page_info': page_info} - model_json.append(page_dict) - infer_results.append(InferenceResult(model_json, dataset)) - return infer_results - - -def may_batch_image_analyze( - images_with_extra_info: list[(np.ndarray, bool, str)], - ocr: bool, - show_log: bool = False, - layout_model=None, - formula_enable=None, - table_enable=None): - # os.environ['CUDA_VISIBLE_DEVICES'] = str(idx) - - from magic_pdf.model.batch_analyze import BatchAnalyze - - model_manager = ModelSingleton() - - # images = [image for image, _, _ in images_with_extra_info] - batch_ratio = 1 - device = get_device() - - if str(device).startswith('npu'): - import torch_npu - if torch_npu.npu.is_available(): - torch.npu.set_compile_mode(jit_compile=False) - - if str(device).startswith('npu') or str(device).startswith('cuda'): - vram = get_vram(device) - if vram is not None: - gpu_memory = int(os.getenv('VIRTUAL_VRAM_SIZE', round(vram))) - if gpu_memory >= 16: - batch_ratio = 16 - elif gpu_memory >= 12: - batch_ratio = 8 - elif gpu_memory >= 8: - batch_ratio = 4 - elif gpu_memory >= 6: - batch_ratio = 2 - else: - batch_ratio = 1 - logger.info(f'gpu_memory: {gpu_memory} GB, batch_ratio: {batch_ratio}') - else: - # Default batch_ratio when VRAM can't be determined - batch_ratio = 1 - logger.info(f'Could not determine GPU memory, using default batch_ratio: {batch_ratio}') - - - # doc_analyze_start = time.time() - - batch_model = BatchAnalyze(model_manager, batch_ratio, show_log, layout_model, formula_enable, table_enable) - results = batch_model(images_with_extra_info) - - # gc_start = time.time() - clean_memory(get_device()) - # gc_time = round(time.time() - gc_start, 2) - # logger.debug(f'gc time: {gc_time}') - - # doc_analyze_time = round(time.time() - doc_analyze_start, 2) - # doc_analyze_speed = round(len(images) / doc_analyze_time, 2) - # logger.debug( - # f'doc analyze time: 
{round(time.time() - doc_analyze_start, 2)},' - # f' speed: {doc_analyze_speed} pages/second' - # ) - return results \ No newline at end of file diff --git a/magic_pdf/model/magic_model.py b/magic_pdf/model/magic_model.py deleted file mode 100644 index b5922d35cf9622685bde3478d872476ca63d7487..0000000000000000000000000000000000000000 --- a/magic_pdf/model/magic_model.py +++ /dev/null @@ -1,771 +0,0 @@ -import enum - -from magic_pdf.config.model_block_type import ModelBlockTypeEnum -from magic_pdf.config.ocr_content_type import CategoryId, ContentType -from magic_pdf.data.dataset import Dataset -from magic_pdf.libs.boxbase import (_is_in, bbox_distance, bbox_relative_pos, - calculate_iou) -from magic_pdf.libs.coordinate_transform import get_scale_ratio -from magic_pdf.pre_proc.remove_bbox_overlap import _remove_overlap_between_bbox - -CAPATION_OVERLAP_AREA_RATIO = 0.6 -MERGE_BOX_OVERLAP_AREA_RATIO = 1.1 - - -class PosRelationEnum(enum.Enum): - LEFT = 'left' - RIGHT = 'right' - UP = 'up' - BOTTOM = 'bottom' - ALL = 'all' - - -class MagicModel: - """每个函数没有得到元素的时候返回空list.""" - - def __fix_axis(self): - for model_page_info in self.__model_list: - need_remove_list = [] - page_no = model_page_info['page_info']['page_no'] - horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio( - model_page_info, self.__docs.get_page(page_no) - ) - layout_dets = model_page_info['layout_dets'] - for layout_det in layout_dets: - - if layout_det.get('bbox') is not None: - # 兼容直接输出bbox的模型数据,如paddle - x0, y0, x1, y1 = layout_det['bbox'] - else: - # 兼容直接输出poly的模型数据,如xxx - x0, y0, _, _, x1, y1, _, _ = layout_det['poly'] - - bbox = [ - int(x0 / horizontal_scale_ratio), - int(y0 / vertical_scale_ratio), - int(x1 / horizontal_scale_ratio), - int(y1 / vertical_scale_ratio), - ] - layout_det['bbox'] = bbox - # 删除高度或者宽度小于等于0的spans - if bbox[2] - bbox[0] <= 0 or bbox[3] - bbox[1] <= 0: - need_remove_list.append(layout_det) - for need_remove in need_remove_list: - layout_dets.remove(need_remove) - - def __fix_by_remove_low_confidence(self): - for model_page_info in self.__model_list: - need_remove_list = [] - layout_dets = model_page_info['layout_dets'] - for layout_det in layout_dets: - if layout_det['score'] <= 0.05: - need_remove_list.append(layout_det) - else: - continue - for need_remove in need_remove_list: - layout_dets.remove(need_remove) - - def __fix_by_remove_high_iou_and_low_confidence(self): - for model_page_info in self.__model_list: - need_remove_list = [] - layout_dets = model_page_info['layout_dets'] - for layout_det1 in layout_dets: - for layout_det2 in layout_dets: - if layout_det1 == layout_det2: - continue - if layout_det1['category_id'] in [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - ] and layout_det2['category_id'] in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]: - if ( - calculate_iou(layout_det1['bbox'], layout_det2['bbox']) - > 0.9 - ): - if layout_det1['score'] < layout_det2['score']: - layout_det_need_remove = layout_det1 - else: - layout_det_need_remove = layout_det2 - - if layout_det_need_remove not in need_remove_list: - need_remove_list.append(layout_det_need_remove) - else: - continue - else: - continue - for need_remove in need_remove_list: - layout_dets.remove(need_remove) - - def __init__(self, model_list: list, docs: Dataset): - self.__model_list = model_list - self.__docs = docs - """为所有模型数据添加bbox信息(缩放,poly->bbox)""" - self.__fix_axis() - """删除置信度特别低的模型数据(<0.05),提高质量""" - self.__fix_by_remove_low_confidence() - """删除高iou(>0.9)数据中置信度较低的那个""" - 
self.__fix_by_remove_high_iou_and_low_confidence() - self.__fix_footnote() - - def _bbox_distance(self, bbox1, bbox2): - left, right, bottom, top = bbox_relative_pos(bbox1, bbox2) - flags = [left, right, bottom, top] - count = sum([1 if v else 0 for v in flags]) - if count > 1: - return float('inf') - if left or right: - l1 = bbox1[3] - bbox1[1] - l2 = bbox2[3] - bbox2[1] - else: - l1 = bbox1[2] - bbox1[0] - l2 = bbox2[2] - bbox2[0] - - if l2 > l1 and (l2 - l1) / l1 > 0.3: - return float('inf') - - return bbox_distance(bbox1, bbox2) - - def __fix_footnote(self): - # 3: figure, 5: table, 7: footnote - for model_page_info in self.__model_list: - footnotes = [] - figures = [] - tables = [] - - for obj in model_page_info['layout_dets']: - if obj['category_id'] == 7: - footnotes.append(obj) - elif obj['category_id'] == 3: - figures.append(obj) - elif obj['category_id'] == 5: - tables.append(obj) - if len(footnotes) * len(figures) == 0: - continue - dis_figure_footnote = {} - dis_table_footnote = {} - - for i in range(len(footnotes)): - for j in range(len(figures)): - pos_flag_count = sum( - list( - map( - lambda x: 1 if x else 0, - bbox_relative_pos( - footnotes[i]['bbox'], figures[j]['bbox'] - ), - ) - ) - ) - if pos_flag_count > 1: - continue - dis_figure_footnote[i] = min( - self._bbox_distance(figures[j]['bbox'], footnotes[i]['bbox']), - dis_figure_footnote.get(i, float('inf')), - ) - for i in range(len(footnotes)): - for j in range(len(tables)): - pos_flag_count = sum( - list( - map( - lambda x: 1 if x else 0, - bbox_relative_pos( - footnotes[i]['bbox'], tables[j]['bbox'] - ), - ) - ) - ) - if pos_flag_count > 1: - continue - - dis_table_footnote[i] = min( - self._bbox_distance(tables[j]['bbox'], footnotes[i]['bbox']), - dis_table_footnote.get(i, float('inf')), - ) - for i in range(len(footnotes)): - if i not in dis_figure_footnote: - continue - if dis_table_footnote.get(i, float('inf')) > dis_figure_footnote[i]: - footnotes[i]['category_id'] = CategoryId.ImageFootnote - - def __reduct_overlap(self, bboxes): - N = len(bboxes) - keep = [True] * N - for i in range(N): - for j in range(N): - if i == j: - continue - if _is_in(bboxes[i]['bbox'], bboxes[j]['bbox']): - keep[i] = False - return [bboxes[i] for i in range(N) if keep[i]] - - def __tie_up_category_by_distance_v2( - self, - page_no: int, - subject_category_id: int, - object_category_id: int, - priority_pos: PosRelationEnum, - ): - """_summary_ - - Args: - page_no (int): _description_ - subject_category_id (int): _description_ - object_category_id (int): _description_ - priority_pos (PosRelationEnum): _description_ - - Returns: - _type_: _description_ - """ - AXIS_MULPLICITY = 0.5 - subjects = self.__reduct_overlap( - list( - map( - lambda x: {'bbox': x['bbox'], 'score': x['score']}, - filter( - lambda x: x['category_id'] == subject_category_id, - self.__model_list[page_no]['layout_dets'], - ), - ) - ) - ) - - objects = self.__reduct_overlap( - list( - map( - lambda x: {'bbox': x['bbox'], 'score': x['score']}, - filter( - lambda x: x['category_id'] == object_category_id, - self.__model_list[page_no]['layout_dets'], - ), - ) - ) - ) - M = len(objects) - - subjects.sort(key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2) - objects.sort(key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2) - - sub_obj_map_h = {i: [] for i in range(len(subjects))} - - dis_by_directions = { - 'top': [[-1, float('inf')]] * M, - 'bottom': [[-1, float('inf')]] * M, - 'left': [[-1, float('inf')]] * M, - 'right': [[-1, float('inf')]] * M, - } - - for i, obj in 
enumerate(objects): - l_x_axis, l_y_axis = ( - obj['bbox'][2] - obj['bbox'][0], - obj['bbox'][3] - obj['bbox'][1], - ) - axis_unit = min(l_x_axis, l_y_axis) - for j, sub in enumerate(subjects): - - bbox1, bbox2, _ = _remove_overlap_between_bbox( - objects[i]['bbox'], subjects[j]['bbox'] - ) - left, right, bottom, top = bbox_relative_pos(bbox1, bbox2) - flags = [left, right, bottom, top] - if sum([1 if v else 0 for v in flags]) > 1: - continue - - if left: - if dis_by_directions['left'][i][1] > bbox_distance( - obj['bbox'], sub['bbox'] - ): - dis_by_directions['left'][i] = [ - j, - bbox_distance(obj['bbox'], sub['bbox']), - ] - if right: - if dis_by_directions['right'][i][1] > bbox_distance( - obj['bbox'], sub['bbox'] - ): - dis_by_directions['right'][i] = [ - j, - bbox_distance(obj['bbox'], sub['bbox']), - ] - if bottom: - if dis_by_directions['bottom'][i][1] > bbox_distance( - obj['bbox'], sub['bbox'] - ): - dis_by_directions['bottom'][i] = [ - j, - bbox_distance(obj['bbox'], sub['bbox']), - ] - if top: - if dis_by_directions['top'][i][1] > bbox_distance( - obj['bbox'], sub['bbox'] - ): - dis_by_directions['top'][i] = [ - j, - bbox_distance(obj['bbox'], sub['bbox']), - ] - - if ( - dis_by_directions['top'][i][1] != float('inf') - and dis_by_directions['bottom'][i][1] != float('inf') - and priority_pos in (PosRelationEnum.BOTTOM, PosRelationEnum.UP) - ): - RATIO = 3 - if ( - abs( - dis_by_directions['top'][i][1] - - dis_by_directions['bottom'][i][1] - ) - < RATIO * axis_unit - ): - - if priority_pos == PosRelationEnum.BOTTOM: - sub_obj_map_h[dis_by_directions['bottom'][i][0]].append(i) - else: - sub_obj_map_h[dis_by_directions['top'][i][0]].append(i) - continue - - if dis_by_directions['left'][i][1] != float('inf') or dis_by_directions[ - 'right' - ][i][1] != float('inf'): - if dis_by_directions['left'][i][1] != float( - 'inf' - ) and dis_by_directions['right'][i][1] != float('inf'): - if AXIS_MULPLICITY * axis_unit >= abs( - dis_by_directions['left'][i][1] - - dis_by_directions['right'][i][1] - ): - left_sub_bbox = subjects[dis_by_directions['left'][i][0]][ - 'bbox' - ] - right_sub_bbox = subjects[dis_by_directions['right'][i][0]][ - 'bbox' - ] - - left_sub_bbox_y_axis = left_sub_bbox[3] - left_sub_bbox[1] - right_sub_bbox_y_axis = right_sub_bbox[3] - right_sub_bbox[1] - - if ( - abs(left_sub_bbox_y_axis - l_y_axis) - + dis_by_directions['left'][i][0] - > abs(right_sub_bbox_y_axis - l_y_axis) - + dis_by_directions['right'][i][0] - ): - left_or_right = dis_by_directions['right'][i] - else: - left_or_right = dis_by_directions['left'][i] - else: - left_or_right = dis_by_directions['left'][i] - if left_or_right[1] > dis_by_directions['right'][i][1]: - left_or_right = dis_by_directions['right'][i] - else: - left_or_right = dis_by_directions['left'][i] - if left_or_right[1] == float('inf'): - left_or_right = dis_by_directions['right'][i] - else: - left_or_right = [-1, float('inf')] - - if dis_by_directions['top'][i][1] != float('inf') or dis_by_directions[ - 'bottom' - ][i][1] != float('inf'): - if dis_by_directions['top'][i][1] != float('inf') and dis_by_directions[ - 'bottom' - ][i][1] != float('inf'): - if AXIS_MULPLICITY * axis_unit >= abs( - dis_by_directions['top'][i][1] - - dis_by_directions['bottom'][i][1] - ): - top_bottom = subjects[dis_by_directions['bottom'][i][0]]['bbox'] - bottom_top = subjects[dis_by_directions['top'][i][0]]['bbox'] - - top_bottom_x_axis = top_bottom[2] - top_bottom[0] - bottom_top_x_axis = bottom_top[2] - bottom_top[0] - if ( - abs(top_bottom_x_axis - l_x_axis) 
- + dis_by_directions['bottom'][i][1] - > abs(bottom_top_x_axis - l_x_axis) - + dis_by_directions['top'][i][1] - ): - top_or_bottom = dis_by_directions['top'][i] - else: - top_or_bottom = dis_by_directions['bottom'][i] - else: - top_or_bottom = dis_by_directions['top'][i] - if top_or_bottom[1] > dis_by_directions['bottom'][i][1]: - top_or_bottom = dis_by_directions['bottom'][i] - else: - top_or_bottom = dis_by_directions['top'][i] - if top_or_bottom[1] == float('inf'): - top_or_bottom = dis_by_directions['bottom'][i] - else: - top_or_bottom = [-1, float('inf')] - - if left_or_right[1] != float('inf') or top_or_bottom[1] != float('inf'): - if left_or_right[1] != float('inf') and top_or_bottom[1] != float( - 'inf' - ): - if AXIS_MULPLICITY * axis_unit >= abs( - left_or_right[1] - top_or_bottom[1] - ): - y_axis_bbox = subjects[left_or_right[0]]['bbox'] - x_axis_bbox = subjects[top_or_bottom[0]]['bbox'] - - if ( - abs((x_axis_bbox[2] - x_axis_bbox[0]) - l_x_axis) / l_x_axis - > abs((y_axis_bbox[3] - y_axis_bbox[1]) - l_y_axis) - / l_y_axis - ): - sub_obj_map_h[left_or_right[0]].append(i) - else: - sub_obj_map_h[top_or_bottom[0]].append(i) - else: - if left_or_right[1] > top_or_bottom[1]: - sub_obj_map_h[top_or_bottom[0]].append(i) - else: - sub_obj_map_h[left_or_right[0]].append(i) - else: - if left_or_right[1] != float('inf'): - sub_obj_map_h[left_or_right[0]].append(i) - else: - sub_obj_map_h[top_or_bottom[0]].append(i) - ret = [] - for i in sub_obj_map_h.keys(): - ret.append( - { - 'sub_bbox': { - 'bbox': subjects[i]['bbox'], - 'score': subjects[i]['score'], - }, - 'obj_bboxes': [ - {'score': objects[j]['score'], 'bbox': objects[j]['bbox']} - for j in sub_obj_map_h[i] - ], - 'sub_idx': i, - } - ) - return ret - - - def __tie_up_category_by_distance_v3( - self, - page_no: int, - subject_category_id: int, - object_category_id: int, - priority_pos: PosRelationEnum, - ): - subjects = self.__reduct_overlap( - list( - map( - lambda x: {'bbox': x['bbox'], 'score': x['score']}, - filter( - lambda x: x['category_id'] == subject_category_id, - self.__model_list[page_no]['layout_dets'], - ), - ) - ) - ) - objects = self.__reduct_overlap( - list( - map( - lambda x: {'bbox': x['bbox'], 'score': x['score']}, - filter( - lambda x: x['category_id'] == object_category_id, - self.__model_list[page_no]['layout_dets'], - ), - ) - ) - ) - - ret = [] - N, M = len(subjects), len(objects) - subjects.sort(key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2) - objects.sort(key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2) - - OBJ_IDX_OFFSET = 10000 - SUB_BIT_KIND, OBJ_BIT_KIND = 0, 1 - - all_boxes_with_idx = [(i, SUB_BIT_KIND, sub['bbox'][0], sub['bbox'][1]) for i, sub in enumerate(subjects)] + [(i + OBJ_IDX_OFFSET , OBJ_BIT_KIND, obj['bbox'][0], obj['bbox'][1]) for i, obj in enumerate(objects)] - seen_idx = set() - seen_sub_idx = set() - - while N > len(seen_sub_idx): - candidates = [] - for idx, kind, x0, y0 in all_boxes_with_idx: - if idx in seen_idx: - continue - candidates.append((idx, kind, x0, y0)) - - if len(candidates) == 0: - break - left_x = min([v[2] for v in candidates]) - top_y = min([v[3] for v in candidates]) - - candidates.sort(key=lambda x: (x[2]-left_x) ** 2 + (x[3] - top_y) ** 2) - - - fst_idx, fst_kind, left_x, top_y = candidates[0] - candidates.sort(key=lambda x: (x[2] - left_x) ** 2 + (x[3] - top_y)**2) - nxt = None - - for i in range(1, len(candidates)): - if candidates[i][1] ^ fst_kind == 1: - nxt = candidates[i] - break - if nxt is None: - break - - if fst_kind == SUB_BIT_KIND: - 
sub_idx, obj_idx = fst_idx, nxt[0] - OBJ_IDX_OFFSET - - else: - sub_idx, obj_idx = nxt[0], fst_idx - OBJ_IDX_OFFSET - - pair_dis = bbox_distance(subjects[sub_idx]['bbox'], objects[obj_idx]['bbox']) - nearest_dis = float('inf') - for i in range(N): - if i in seen_idx or i == sub_idx:continue - nearest_dis = min(nearest_dis, bbox_distance(subjects[i]['bbox'], objects[obj_idx]['bbox'])) - - if pair_dis >= 3*nearest_dis: - seen_idx.add(sub_idx) - continue - - seen_idx.add(sub_idx) - seen_idx.add(obj_idx + OBJ_IDX_OFFSET) - seen_sub_idx.add(sub_idx) - - ret.append( - { - 'sub_bbox': { - 'bbox': subjects[sub_idx]['bbox'], - 'score': subjects[sub_idx]['score'], - }, - 'obj_bboxes': [ - {'score': objects[obj_idx]['score'], 'bbox': objects[obj_idx]['bbox']} - ], - 'sub_idx': sub_idx, - } - ) - - for i in range(len(objects)): - j = i + OBJ_IDX_OFFSET - if j in seen_idx: - continue - seen_idx.add(j) - nearest_dis, nearest_sub_idx = float('inf'), -1 - for k in range(len(subjects)): - dis = bbox_distance(objects[i]['bbox'], subjects[k]['bbox']) - if dis < nearest_dis: - nearest_dis = dis - nearest_sub_idx = k - - for k in range(len(subjects)): - if k != nearest_sub_idx: continue - if k in seen_sub_idx: - for kk in range(len(ret)): - if ret[kk]['sub_idx'] == k: - ret[kk]['obj_bboxes'].append({'score': objects[i]['score'], 'bbox': objects[i]['bbox']}) - break - else: - ret.append( - { - 'sub_bbox': { - 'bbox': subjects[k]['bbox'], - 'score': subjects[k]['score'], - }, - 'obj_bboxes': [ - {'score': objects[i]['score'], 'bbox': objects[i]['bbox']} - ], - 'sub_idx': k, - } - ) - seen_sub_idx.add(k) - seen_idx.add(k) - - - for i in range(len(subjects)): - if i in seen_sub_idx: - continue - ret.append( - { - 'sub_bbox': { - 'bbox': subjects[i]['bbox'], - 'score': subjects[i]['score'], - }, - 'obj_bboxes': [], - 'sub_idx': i, - } - ) - - - return ret - - - def get_imgs_v2(self, page_no: int): - with_captions = self.__tie_up_category_by_distance_v3( - page_no, 3, 4, PosRelationEnum.BOTTOM - ) - with_footnotes = self.__tie_up_category_by_distance_v3( - page_no, 3, CategoryId.ImageFootnote, PosRelationEnum.ALL - ) - ret = [] - for v in with_captions: - record = { - 'image_body': v['sub_bbox'], - 'image_caption_list': v['obj_bboxes'], - } - filter_idx = v['sub_idx'] - d = next(filter(lambda x: x['sub_idx'] == filter_idx, with_footnotes)) - record['image_footnote_list'] = d['obj_bboxes'] - ret.append(record) - return ret - - def get_tables_v2(self, page_no: int) -> list: - with_captions = self.__tie_up_category_by_distance_v3( - page_no, 5, 6, PosRelationEnum.UP - ) - with_footnotes = self.__tie_up_category_by_distance_v3( - page_no, 5, 7, PosRelationEnum.ALL - ) - ret = [] - for v in with_captions: - record = { - 'table_body': v['sub_bbox'], - 'table_caption_list': v['obj_bboxes'], - } - filter_idx = v['sub_idx'] - d = next(filter(lambda x: x['sub_idx'] == filter_idx, with_footnotes)) - record['table_footnote_list'] = d['obj_bboxes'] - ret.append(record) - return ret - - def get_imgs(self, page_no: int): - return self.get_imgs_v2(page_no) - - def get_tables( - self, page_no: int - ) -> list: # 3个坐标, caption, table主体,table-note - return self.get_tables_v2(page_no) - - def get_equations(self, page_no: int) -> list: # 有坐标,也有字 - inline_equations = self.__get_blocks_by_type( - ModelBlockTypeEnum.EMBEDDING.value, page_no, ['latex'] - ) - interline_equations = self.__get_blocks_by_type( - ModelBlockTypeEnum.ISOLATED.value, page_no, ['latex'] - ) - interline_equations_blocks = self.__get_blocks_by_type( - 
ModelBlockTypeEnum.ISOLATE_FORMULA.value, page_no - ) - return inline_equations, interline_equations, interline_equations_blocks - - def get_discarded(self, page_no: int) -> list: # 自研模型,只有坐标 - blocks = self.__get_blocks_by_type(ModelBlockTypeEnum.ABANDON.value, page_no) - return blocks - - def get_text_blocks(self, page_no: int) -> list: # 自研模型搞的,只有坐标,没有字 - blocks = self.__get_blocks_by_type(ModelBlockTypeEnum.PLAIN_TEXT.value, page_no) - return blocks - - def get_title_blocks(self, page_no: int) -> list: # 自研模型,只有坐标,没字 - blocks = self.__get_blocks_by_type(ModelBlockTypeEnum.TITLE.value, page_no) - return blocks - - def get_ocr_text(self, page_no: int) -> list: # paddle 搞的,有字也有坐标 - text_spans = [] - model_page_info = self.__model_list[page_no] - layout_dets = model_page_info['layout_dets'] - for layout_det in layout_dets: - if layout_det['category_id'] == '15': - span = { - 'bbox': layout_det['bbox'], - 'content': layout_det['text'], - } - text_spans.append(span) - return text_spans - - def get_all_spans(self, page_no: int) -> list: - - def remove_duplicate_spans(spans): - new_spans = [] - for span in spans: - if not any(span == existing_span for existing_span in new_spans): - new_spans.append(span) - return new_spans - - all_spans = [] - model_page_info = self.__model_list[page_no] - layout_dets = model_page_info['layout_dets'] - allow_category_id_list = [3, 5, 13, 14, 15] - """当成span拼接的""" - # 3: 'image', # 图片 - # 5: 'table', # 表格 - # 13: 'inline_equation', # 行内公式 - # 14: 'interline_equation', # 行间公式 - # 15: 'text', # ocr识别文本 - for layout_det in layout_dets: - category_id = layout_det['category_id'] - if category_id in allow_category_id_list: - span = {'bbox': layout_det['bbox'], 'score': layout_det['score']} - if category_id == 3: - span['type'] = ContentType.Image - elif category_id == 5: - # 获取table模型结果 - latex = layout_det.get('latex', None) - html = layout_det.get('html', None) - if latex: - span['latex'] = latex - elif html: - span['html'] = html - span['type'] = ContentType.Table - elif category_id == 13: - span['content'] = layout_det['latex'] - span['type'] = ContentType.InlineEquation - elif category_id == 14: - span['content'] = layout_det['latex'] - span['type'] = ContentType.InterlineEquation - elif category_id == 15: - span['content'] = layout_det['text'] - span['type'] = ContentType.Text - all_spans.append(span) - return remove_duplicate_spans(all_spans) - - def get_page_size(self, page_no: int): # 获取页面宽高 - # 获取当前页的page对象 - page = self.__docs.get_page(page_no).get_page_info() - # 获取当前页的宽高 - page_w = page.w - page_h = page.h - return page_w, page_h - - def __get_blocks_by_type( - self, type: int, page_no: int, extra_col: list[str] = [] - ) -> list: - blocks = [] - for page_dict in self.__model_list: - layout_dets = page_dict.get('layout_dets', []) - page_info = page_dict.get('page_info', {}) - page_number = page_info.get('page_no', -1) - if page_no != page_number: - continue - for item in layout_dets: - category_id = item.get('category_id', -1) - bbox = item.get('bbox', None) - - if category_id == type: - block = { - 'bbox': bbox, - 'score': item.get('score'), - } - for col in extra_col: - block[col] = item.get(col, None) - blocks.append(block) - return blocks - - def get_model_list(self, page_no): - return self.__model_list[page_no] diff --git a/magic_pdf/model/model_list.py b/magic_pdf/model/model_list.py deleted file mode 100644 index ec871d16981dc663d9ff635886ca847d6b16f6d5..0000000000000000000000000000000000000000 --- a/magic_pdf/model/model_list.py +++ /dev/null @@ 
-1,12 +0,0 @@ -class MODEL: - Paddle = "pp_structure_v2" - PEK = "pdf_extract_kit" - - -class AtomicModel: - Layout = "layout" - MFD = "mfd" - MFR = "mfr" - OCR = "ocr" - Table = "table" - LangDetect = "langdetect" diff --git a/magic_pdf/model/pdf_extract_kit.py b/magic_pdf/model/pdf_extract_kit.py deleted file mode 100644 index f389a306565e20ff00dfef7c784bce30151583f1..0000000000000000000000000000000000000000 --- a/magic_pdf/model/pdf_extract_kit.py +++ /dev/null @@ -1,266 +0,0 @@ -# flake8: noqa -import os -import time - -import cv2 -import torch -import yaml -from loguru import logger - -os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新 - -from magic_pdf.config.constants import * -from magic_pdf.model.model_list import AtomicModel -from magic_pdf.model.sub_modules.model_init import AtomModelSingleton -from magic_pdf.model.sub_modules.model_utils import ( - clean_vram, crop_img, get_res_list_from_layout_res) -from magic_pdf.model.sub_modules.ocr.paddleocr2pytorch.ocr_utils import ( - get_adjusted_mfdetrec_res, get_ocr_result_list) - - -class CustomPEKModel: - - def __init__(self, ocr: bool = False, show_log: bool = False, **kwargs): - """ - ======== model init ======== - """ - # 获取当前文件(即 pdf_extract_kit.py)的绝对路径 - current_file_path = os.path.abspath(__file__) - # 获取当前文件所在的目录(model) - current_dir = os.path.dirname(current_file_path) - # 上一级目录(magic_pdf) - root_dir = os.path.dirname(current_dir) - # model_config目录 - model_config_dir = os.path.join(root_dir, 'resources', 'model_config') - # 构建 model_configs.yaml 文件的完整路径 - config_path = os.path.join(model_config_dir, 'model_configs.yaml') - with open(config_path, 'r', encoding='utf-8') as f: - self.configs = yaml.load(f, Loader=yaml.FullLoader) - # 初始化解析配置 - - # layout config - self.layout_config = kwargs.get('layout_config') - self.layout_model_name = self.layout_config.get( - 'model', MODEL_NAME.DocLayout_YOLO - ) - - # formula config - self.formula_config = kwargs.get('formula_config') - self.mfd_model_name = self.formula_config.get( - 'mfd_model', MODEL_NAME.YOLO_V8_MFD - ) - self.mfr_model_name = self.formula_config.get( - 'mfr_model', MODEL_NAME.UniMerNet_v2_Small - ) - self.apply_formula = self.formula_config.get('enable', True) - - # table config - self.table_config = kwargs.get('table_config') - self.apply_table = self.table_config.get('enable', False) - self.table_max_time = self.table_config.get('max_time', TABLE_MAX_TIME_VALUE) - self.table_model_name = self.table_config.get('model', MODEL_NAME.RAPID_TABLE) - self.table_sub_model_name = self.table_config.get('sub_model', None) - - # ocr config - self.apply_ocr = ocr - self.lang = kwargs.get('lang', None) - - logger.info( - 'DocAnalysis init, this may take some times, layout_model: {}, apply_formula: {}, apply_ocr: {}, ' - 'apply_table: {}, table_model: {}, lang: {}'.format( - self.layout_model_name, - self.apply_formula, - self.apply_ocr, - self.apply_table, - self.table_model_name, - self.lang, - ) - ) - # 初始化解析方案 - self.device = kwargs.get('device', 'cpu') - - logger.info('using device: {}'.format(self.device)) - models_dir = kwargs.get( - 'models_dir', os.path.join(root_dir, 'resources', 'models') - ) - logger.info('using models_dir: {}'.format(models_dir)) - - atom_model_manager = AtomModelSingleton() - - # 初始化公式识别 - if self.apply_formula: - # 初始化公式检测模型 - self.mfd_model = atom_model_manager.get_atom_model( - atom_model_name=AtomicModel.MFD, - mfd_weights=str( - os.path.join( - models_dir, self.configs['weights'][self.mfd_model_name] - ) - ), - 
device=self.device, - ) - - # 初始化公式解析模型 - mfr_weight_dir = str( - os.path.join(models_dir, self.configs['weights'][self.mfr_model_name]) - ) - mfr_cfg_path = str(os.path.join(model_config_dir, 'UniMERNet', 'demo.yaml')) - - self.mfr_model = atom_model_manager.get_atom_model( - atom_model_name=AtomicModel.MFR, - mfr_weight_dir=mfr_weight_dir, - mfr_cfg_path=mfr_cfg_path, - device=self.device, - ) - - # 初始化layout模型 - if self.layout_model_name == MODEL_NAME.LAYOUTLMv3: - self.layout_model = atom_model_manager.get_atom_model( - atom_model_name=AtomicModel.Layout, - layout_model_name=MODEL_NAME.LAYOUTLMv3, - layout_weights=str( - os.path.join( - models_dir, self.configs['weights'][self.layout_model_name] - ) - ), - layout_config_file=str( - os.path.join( - model_config_dir, 'layoutlmv3', 'layoutlmv3_base_inference.yaml' - ) - ), - device='cpu' if str(self.device).startswith("mps") else self.device, - ) - elif self.layout_model_name == MODEL_NAME.DocLayout_YOLO: - self.layout_model = atom_model_manager.get_atom_model( - atom_model_name=AtomicModel.Layout, - layout_model_name=MODEL_NAME.DocLayout_YOLO, - doclayout_yolo_weights=str( - os.path.join( - models_dir, self.configs['weights'][self.layout_model_name] - ) - ), - device=self.device, - ) - # 初始化ocr - self.ocr_model = atom_model_manager.get_atom_model( - atom_model_name=AtomicModel.OCR, - ocr_show_log=show_log, - det_db_box_thresh=0.3, - lang=self.lang - ) - # init table model - if self.apply_table: - table_model_dir = self.configs['weights'][self.table_model_name] - self.table_model = atom_model_manager.get_atom_model( - atom_model_name=AtomicModel.Table, - table_model_name=self.table_model_name, - table_model_path=str(os.path.join(models_dir, table_model_dir)), - table_max_time=self.table_max_time, - device=self.device, - ocr_engine=self.ocr_model, - table_sub_model_name=self.table_sub_model_name - ) - - logger.info('DocAnalysis init done!') - - def __call__(self, image): - # layout检测 - layout_start = time.time() - layout_res = [] - if self.layout_model_name == MODEL_NAME.LAYOUTLMv3: - # layoutlmv3 - layout_res = self.layout_model(image, ignore_catids=[]) - elif self.layout_model_name == MODEL_NAME.DocLayout_YOLO: - layout_res = self.layout_model.predict(image) - - layout_cost = round(time.time() - layout_start, 2) - logger.info(f'layout detection time: {layout_cost}') - - if self.apply_formula: - # 公式检测 - mfd_start = time.time() - mfd_res = self.mfd_model.predict(image) - logger.info(f'mfd time: {round(time.time() - mfd_start, 2)}') - - # 公式识别 - mfr_start = time.time() - formula_list = self.mfr_model.predict(mfd_res, image) - layout_res.extend(formula_list) - mfr_cost = round(time.time() - mfr_start, 2) - logger.info(f'formula nums: {len(formula_list)}, mfr time: {mfr_cost}') - - # 清理显存 - clean_vram(self.device, vram_threshold=6) - - # 从layout_res中获取ocr区域、表格区域、公式区域 - ocr_res_list, table_res_list, single_page_mfdetrec_res = ( - get_res_list_from_layout_res(layout_res) - ) - - # ocr识别 - ocr_start = time.time() - # Process each area that requires OCR processing - for res in ocr_res_list: - new_image, useful_list = crop_img(res, image, crop_paste_x=50, crop_paste_y=50) - adjusted_mfdetrec_res = get_adjusted_mfdetrec_res(single_page_mfdetrec_res, useful_list) - - # OCR recognition - new_image = cv2.cvtColor(new_image, cv2.COLOR_RGB2BGR) - - if self.apply_ocr: - ocr_res = self.ocr_model.ocr(new_image, mfd_res=adjusted_mfdetrec_res)[0] - else: - ocr_res = self.ocr_model.ocr(new_image, mfd_res=adjusted_mfdetrec_res, rec=False)[0] - - # Integration 
results - if ocr_res: - ocr_result_list = get_ocr_result_list(ocr_res, useful_list) - layout_res.extend(ocr_result_list) - - ocr_cost = round(time.time() - ocr_start, 2) - if self.apply_ocr: - logger.info(f"ocr time: {ocr_cost}") - else: - logger.info(f"det time: {ocr_cost}") - - # 表格识别 table recognition - if self.apply_table: - table_start = time.time() - for res in table_res_list: - new_image, _ = crop_img(res, image) - single_table_start_time = time.time() - html_code = None - if self.table_model_name == MODEL_NAME.STRUCT_EQTABLE: - with torch.no_grad(): - table_result = self.table_model.predict(new_image, 'html') - if len(table_result) > 0: - html_code = table_result[0] - elif self.table_model_name == MODEL_NAME.TABLE_MASTER: - html_code = self.table_model.img2html(new_image) - elif self.table_model_name == MODEL_NAME.RAPID_TABLE: - html_code, table_cell_bboxes, logic_points, elapse = self.table_model.predict( - new_image - ) - run_time = time.time() - single_table_start_time - if run_time > self.table_max_time: - logger.warning( - f'table recognition processing exceeds max time {self.table_max_time}s' - ) - # 判断是否返回正常 - if html_code: - expected_ending = html_code.strip().endswith( - '' - ) or html_code.strip().endswith('') - if expected_ending: - res['html'] = html_code - else: - logger.warning( - 'table recognition processing fails, not found expected HTML table end' - ) - else: - logger.warning( - 'table recognition processing fails, not get html return' - ) - logger.info(f'table time: {round(time.time() - table_start, 2)}') - - return layout_res diff --git a/magic_pdf/model/pp_structure_v2.py b/magic_pdf/model/pp_structure_v2.py deleted file mode 100644 index ad9f71ad47d0dc9513b4913f4b156e1ab7fb65b2..0000000000000000000000000000000000000000 --- a/magic_pdf/model/pp_structure_v2.py +++ /dev/null @@ -1,110 +0,0 @@ -import random - -from loguru import logger - -try: - from paddleocr import PPStructure -except ImportError: - logger.error('paddleocr not installed, please install by "pip install magic-pdf[lite]"') - exit(1) - - -def region_to_bbox(region): - x0 = region[0][0] - y0 = region[0][1] - x1 = region[2][0] - y1 = region[2][1] - return [x0, y0, x1, y1] - - -class CustomPaddleModel: - def __init__(self, - ocr: bool = False, - show_log: bool = False, - lang=None, - det_db_box_thresh=0.3, - use_dilation=True, - det_db_unclip_ratio=1.8 - ): - if lang is not None: - self.model = PPStructure(table=False, - ocr=True, - show_log=show_log, - lang=lang, - det_db_box_thresh=det_db_box_thresh, - use_dilation=use_dilation, - det_db_unclip_ratio=det_db_unclip_ratio, - ) - else: - self.model = PPStructure(table=False, - ocr=True, - show_log=show_log, - det_db_box_thresh=det_db_box_thresh, - use_dilation=use_dilation, - det_db_unclip_ratio=det_db_unclip_ratio, - ) - - def __call__(self, img): - try: - import cv2 - except ImportError: - logger.error("opencv-python not installed, please install by pip.") - exit(1) - # 将RGB图片转换为BGR格式适配paddle - img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) - result = self.model(img) - spans = [] - for line in result: - line.pop("img") - """ - 为paddle输出适配type no. 
- title: 0 # 标题 - text: 1 # 文本 - header: 2 # abandon - footer: 2 # abandon - reference: 1 # 文本 or abandon - equation: 8 # 行间公式 block - equation: 14 # 行间公式 text - figure: 3 # 图片 - figure_caption: 4 # 图片描述 - table: 5 # 表格 - table_caption: 6 # 表格描述 - """ - if line["type"] == "title": - line["category_id"] = 0 - elif line["type"] in ["text", "reference"]: - line["category_id"] = 1 - elif line["type"] == "figure": - line["category_id"] = 3 - elif line["type"] == "figure_caption": - line["category_id"] = 4 - elif line["type"] == "table": - line["category_id"] = 5 - elif line["type"] == "table_caption": - line["category_id"] = 6 - elif line["type"] == "equation": - line["category_id"] = 8 - elif line["type"] in ["header", "footer"]: - line["category_id"] = 2 - else: - logger.warning(f"unknown type: {line['type']}") - - # 兼容不输出score的paddleocr版本 - if line.get("score") is None: - line["score"] = 0.5 + random.random() * 0.5 - - res = line.pop("res", None) - if res is not None and len(res) > 0: - for span in res: - new_span = { - "category_id": 15, - "bbox": region_to_bbox(span["text_region"]), - "score": span["confidence"], - "text": span["text"], - } - spans.append(new_span) - - if len(spans) > 0: - result.extend(spans) - - return result diff --git a/magic_pdf/model/sub_modules/__init__.py b/magic_pdf/model/sub_modules/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/magic_pdf/model/sub_modules/language_detection/utils.py b/magic_pdf/model/sub_modules/language_detection/utils.py deleted file mode 100644 index 20aefaf6a870edb0247e094cd583314d35bbb5d2..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/language_detection/utils.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright (c) Opendatalab. All rights reserved. 
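# --- Editor's note: a hypothetical condensed form of the PPStructure type -> category_id
# mapping implemented by the if/elif chain in CustomPaddleModel.__call__ above; it is not
# part of the original file and is shown only to make the mapping easier to scan.
PP_TYPE_TO_CATEGORY_ID = {
    "title": 0,            # title
    "text": 1,             # body text
    "reference": 1,        # treated as body text
    "header": 2,           # abandon
    "footer": 2,           # abandon
    "figure": 3,           # image
    "figure_caption": 4,   # image caption
    "table": 5,            # table
    "table_caption": 6,    # table caption
    "equation": 8,         # interline equation block
}
# line["category_id"] = PP_TYPE_TO_CATEGORY_ID.get(line["type"])  # unknown types only log a warning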
-import os -from pathlib import Path - -import yaml -os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新 - -from magic_pdf.config.constants import MODEL_NAME -from magic_pdf.data.utils import load_images_from_pdf -from magic_pdf.libs.config_reader import get_local_models_dir, get_device -from magic_pdf.libs.pdf_check import extract_pages -from magic_pdf.model.model_list import AtomicModel -from magic_pdf.model.sub_modules.model_init import AtomModelSingleton - - -def get_model_config(): - local_models_dir = get_local_models_dir() - device = get_device() - current_file_path = os.path.abspath(__file__) - root_dir = Path(current_file_path).parents[3] - model_config_dir = os.path.join(root_dir, 'resources', 'model_config') - config_path = os.path.join(model_config_dir, 'model_configs.yaml') - with open(config_path, 'r', encoding='utf-8') as f: - configs = yaml.load(f, Loader=yaml.FullLoader) - return root_dir, local_models_dir, device, configs - - -def get_text_images(simple_images): - _, local_models_dir, device, configs = get_model_config() - atom_model_manager = AtomModelSingleton() - temp_layout_model = atom_model_manager.get_atom_model( - atom_model_name=AtomicModel.Layout, - layout_model_name=MODEL_NAME.DocLayout_YOLO, - doclayout_yolo_weights=str( - os.path.join( - local_models_dir, configs['weights'][MODEL_NAME.DocLayout_YOLO] - ) - ), - device=device, - ) - text_images = [] - for simple_image in simple_images: - image = simple_image['img'] - layout_res = temp_layout_model.predict(image) - # 给textblock截图 - for res in layout_res: - if res['category_id'] in [1]: - x1, y1, _, _, x2, y2, _, _ = res['poly'] - # 初步清洗(宽和高都小于100) - if x2 - x1 < 100 and y2 - y1 < 100: - continue - text_images.append(image[y1:y2, x1:x2]) - return text_images - - -def auto_detect_lang(pdf_bytes: bytes): - sample_docs = extract_pages(pdf_bytes) - sample_pdf_bytes = sample_docs.tobytes() - simple_images = load_images_from_pdf(sample_pdf_bytes, dpi=200) - text_images = get_text_images(simple_images) - langdetect_model = model_init(MODEL_NAME.YOLO_V11_LangDetect) - lang = langdetect_model.do_detect(text_images) - return lang - - -def model_init(model_name: str): - atom_model_manager = AtomModelSingleton() - - if model_name == MODEL_NAME.YOLO_V11_LangDetect: - root_dir, _, device, _ = get_model_config() - model = atom_model_manager.get_atom_model( - atom_model_name=AtomicModel.LangDetect, - langdetect_model_name=MODEL_NAME.YOLO_V11_LangDetect, - langdetect_model_weight=str(os.path.join(root_dir, 'resources', 'yolov11-langdetect', 'yolo_v11_ft.pt')), - device=device, - ) - else: - raise ValueError(f"model_name {model_name} not found") - return model - diff --git a/magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py b/magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py deleted file mode 100644 index 28cdb17cd0f06aa8edfd3037e680edacec63a90e..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright (c) Opendatalab. All rights reserved. 
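# --- Editor's note: a minimal, hypothetical usage sketch for the language-detection helper
# deleted above (language_detection/utils.auto_detect_lang). It assumes the pre-deletion
# package layout plus a configured local models directory; the file name is illustrative only.
from magic_pdf.model.sub_modules.language_detection.utils import auto_detect_lang

with open("sample.pdf", "rb") as f:
    pdf_bytes = f.read()
lang = auto_detect_lang(pdf_bytes)  # e.g. 'ch', 'en', ..., or None if no text regions were found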
-import time -from collections import Counter -from uuid import uuid4 -import cv2 -import numpy as np -import torch -from loguru import logger -from ultralytics import YOLO - -language_dict = { - "ch": "中文简体", - "en": "英语", - "japan": "日语", - "korean": "韩语", - "fr": "法语", - "german": "德语", - "ar": "阿拉伯语", - "ru": "俄语" -} - - -def split_images(image, result_images=None): - """ - 对输入文件夹内的图片进行处理,若图片竖向(y方向)分辨率超过400,则进行拆分, - 每次平分图片,直至拆分出的图片竖向分辨率都满足400以下,将处理后的图片(拆分后的子图片)保存到输出文件夹。 - 避免保存因裁剪区域超出图片范围导致出现的无效黑色图片部分。 - """ - if result_images is None: - result_images = [] - - height, width = image.shape[:2] - long_side = max(width, height) # 获取较长边长度 - - if long_side <= 400: - result_images.append(image) - return result_images - - new_long_side = long_side // 2 - sub_images = [] - - if width >= height: # 如果宽度是较长边 - for x in range(0, width, new_long_side): - # 判断裁剪区域是否超出图片范围,如果超出则不进行裁剪保存操作 - if x + new_long_side > width: - continue - sub_image = image[0:height, x:x + new_long_side] - sub_images.append(sub_image) - else: # 如果高度是较长边 - for y in range(0, height, new_long_side): - # 判断裁剪区域是否超出图片范围,如果超出则不进行裁剪保存操作 - if y + new_long_side > height: - continue - sub_image = image[y:y + new_long_side, 0:width] - sub_images.append(sub_image) - - for sub_image in sub_images: - split_images(sub_image, result_images) - - return result_images - - -def resize_images_to_224(image): - """ - 若分辨率小于224则用黑色背景补齐到224*224大小,若大于等于224则调整为224*224大小。 - Works directly with NumPy arrays. - """ - try: - height, width = image.shape[:2] - - if width < 224 or height < 224: - # Create black background - new_image = np.zeros((224, 224, 3), dtype=np.uint8) - # Calculate paste position (ensure they're not negative) - paste_x = max(0, (224 - width) // 2) - paste_y = max(0, (224 - height) // 2) - # Make sure we don't exceed the boundaries of new_image - paste_width = min(width, 224) - paste_height = min(height, 224) - # Paste original image onto black background - new_image[paste_y:paste_y + paste_height, paste_x:paste_x + paste_width] = image[:paste_height, :paste_width] - image = new_image - else: - # Resize using cv2 - image = cv2.resize(image, (224, 224), interpolation=cv2.INTER_LANCZOS4) - - return image - except Exception as e: - logger.exception(f"Error in resize_images_to_224: {e}") - return None - - -class YOLOv11LangDetModel(object): - def __init__(self, langdetect_model_weight, device): - - self.model = YOLO(langdetect_model_weight) - - if str(device).startswith("npu"): - self.device = torch.device(device) - else: - self.device = device - def do_detect(self, images: list): - all_images = [] - for image in images: - height, width = image.shape[:2] - if width < 100 and height < 100: - continue - temp_images = split_images(image) - for temp_image in temp_images: - all_images.append(resize_images_to_224(temp_image)) - # langdetect_start = time.time() - images_lang_res = self.batch_predict(all_images, batch_size=256) - # logger.info(f"image number of langdetect: {len(images_lang_res)}, langdetect time: {round(time.time() - langdetect_start, 2)}") - if len(images_lang_res) > 0: - count_dict = Counter(images_lang_res) - language = max(count_dict, key=count_dict.get) - else: - language = None - return language - - def predict(self, image): - results = self.model.predict(image, verbose=False, device=self.device) - predicted_class_id = int(results[0].probs.top1) - predicted_class_name = self.model.names[predicted_class_id] - return predicted_class_name - - - def batch_predict(self, images: list, batch_size: int) -> list: - images_lang_res = [] 
- - for index in range(0, len(images), batch_size): - lang_res = [ - image_res.cpu() - for image_res in self.model.predict( - images[index: index + batch_size], - verbose = False, - device=self.device, - ) - ] - for res in lang_res: - predicted_class_id = int(res.probs.top1) - predicted_class_name = self.model.names[predicted_class_id] - images_lang_res.append(predicted_class_name) - - return images_lang_res \ No newline at end of file diff --git a/magic_pdf/model/sub_modules/layout/__init__.py b/magic_pdf/model/sub_modules/layout/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py b/magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py deleted file mode 100644 index 2c7a23a37e7e0dc19f364db743e6947e8349fe59..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +++ /dev/null @@ -1,64 +0,0 @@ -from doclayout_yolo import YOLOv10 -from tqdm import tqdm - - -class DocLayoutYOLOModel(object): - def __init__(self, weight, device): - self.model = YOLOv10(weight) - self.device = device - - def predict(self, image): - layout_res = [] - doclayout_yolo_res = self.model.predict( - image, - imgsz=1280, - conf=0.10, - iou=0.45, - verbose=False, device=self.device - )[0] - for xyxy, conf, cla in zip( - doclayout_yolo_res.boxes.xyxy.cpu(), - doclayout_yolo_res.boxes.conf.cpu(), - doclayout_yolo_res.boxes.cls.cpu(), - ): - xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy] - new_item = { - "category_id": int(cla.item()), - "poly": [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax], - "score": round(float(conf.item()), 3), - } - layout_res.append(new_item) - return layout_res - - def batch_predict(self, images: list, batch_size: int) -> list: - images_layout_res = [] - # for index in range(0, len(images), batch_size): - for index in tqdm(range(0, len(images), batch_size), desc="Layout Predict"): - doclayout_yolo_res = [ - image_res.cpu() - for image_res in self.model.predict( - images[index : index + batch_size], - imgsz=1280, - conf=0.10, - iou=0.45, - verbose=False, - device=self.device, - ) - ] - for image_res in doclayout_yolo_res: - layout_res = [] - for xyxy, conf, cla in zip( - image_res.boxes.xyxy, - image_res.boxes.conf, - image_res.boxes.cls, - ): - xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy] - new_item = { - "category_id": int(cla.item()), - "poly": [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax], - "score": round(float(conf.item()), 3), - } - layout_res.append(new_item) - images_layout_res.append(layout_res) - - return images_layout_res diff --git a/magic_pdf/model/sub_modules/layout/doclayout_yolo/__init__.py b/magic_pdf/model/sub_modules/layout/doclayout_yolo/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/magic_pdf/model/sub_modules/layout/layoutlmv3/__init__.py b/magic_pdf/model/sub_modules/layout/layoutlmv3/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/magic_pdf/model/sub_modules/layout/layoutlmv3/backbone.py b/magic_pdf/model/sub_modules/layout/layoutlmv3/backbone.py deleted file mode 100644 index 5364f862e78205c65ffe3fdeba6aef09da148c39..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/layout/layoutlmv3/backbone.py +++ /dev/null @@ -1,179 +0,0 @@ 
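Two small conventions from the classes above are worth spelling out: the layout model converts each xyxy detection into an 8-value clockwise `poly`, and the language classifier labels every 224x224 crop independently and keeps the most frequent prediction. A hedged sketch of both as standalone helpers (not the original methods):

```python
# Sketch only: restates two details of the deleted model wrappers above.
from collections import Counter

def xyxy_to_layout_item(xyxy, conf, cls_id):
    """Convert one detection into the dict format used by the layout code:
    an 8-point polygon (x0,y0, x1,y0, x1,y1, x0,y1) plus class id and score."""
    xmin, ymin, xmax, ymax = [int(v) for v in xyxy]
    return {
        "category_id": int(cls_id),
        "poly": [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
        "score": round(float(conf), 3),
    }

def majority_language(per_crop_predictions):
    """The language detector above returns the most common per-crop label,
    or None when there were no usable crops."""
    if not per_crop_predictions:
        return None
    return Counter(per_crop_predictions).most_common(1)[0][0]
```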
-# -------------------------------------------------------------------------------- -# VIT: Multi-Path Vision Transformer for Dense Prediction -# Copyright (c) 2022 Electronics and Telecommunications Research Institute (ETRI). -# All Rights Reserved. -# Written by Youngwan Lee -# This source code is licensed(Dual License(GPL3.0 & Commercial)) under the license found in the -# LICENSE file in the root directory of this source tree. -# -------------------------------------------------------------------------------- -# References: -# timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm -# CoaT: https://github.com/mlpc-ucsd/CoaT -# -------------------------------------------------------------------------------- - - -import torch - -from detectron2.layers import ( - ShapeSpec, -) -from detectron2.modeling import Backbone, BACKBONE_REGISTRY, FPN -from detectron2.modeling.backbone.fpn import LastLevelP6P7, LastLevelMaxPool - -from .beit import beit_base_patch16, dit_base_patch16, dit_large_patch16, beit_large_patch16 -from .deit import deit_base_patch16, mae_base_patch16 -from .layoutlmft.models.layoutlmv3 import LayoutLMv3Model -from transformers import AutoConfig - -__all__ = [ - "build_vit_fpn_backbone", -] - - -class VIT_Backbone(Backbone): - """ - Implement VIT backbone. - """ - - def __init__(self, name, out_features, drop_path, img_size, pos_type, model_kwargs, - config_path=None, image_only=False, cfg=None): - super().__init__() - self._out_features = out_features - if 'base' in name: - self._out_feature_strides = {"layer3": 4, "layer5": 8, "layer7": 16, "layer11": 32} - self._out_feature_channels = {"layer3": 768, "layer5": 768, "layer7": 768, "layer11": 768} - else: - self._out_feature_strides = {"layer7": 4, "layer11": 8, "layer15": 16, "layer23": 32} - self._out_feature_channels = {"layer7": 1024, "layer11": 1024, "layer15": 1024, "layer23": 1024} - - if name == 'beit_base_patch16': - model_func = beit_base_patch16 - elif name == 'dit_base_patch16': - model_func = dit_base_patch16 - elif name == "deit_base_patch16": - model_func = deit_base_patch16 - elif name == "mae_base_patch16": - model_func = mae_base_patch16 - elif name == "dit_large_patch16": - model_func = dit_large_patch16 - elif name == "beit_large_patch16": - model_func = beit_large_patch16 - - if 'beit' in name or 'dit' in name: - if pos_type == "abs": - self.backbone = model_func(img_size=img_size, - out_features=out_features, - drop_path_rate=drop_path, - use_abs_pos_emb=True, - **model_kwargs) - elif pos_type == "shared_rel": - self.backbone = model_func(img_size=img_size, - out_features=out_features, - drop_path_rate=drop_path, - use_shared_rel_pos_bias=True, - **model_kwargs) - elif pos_type == "rel": - self.backbone = model_func(img_size=img_size, - out_features=out_features, - drop_path_rate=drop_path, - use_rel_pos_bias=True, - **model_kwargs) - else: - raise ValueError() - elif "layoutlmv3" in name: - config = AutoConfig.from_pretrained(config_path) - # disable relative bias as DiT - config.has_spatial_attention_bias = False - config.has_relative_attention_bias = False - self.backbone = LayoutLMv3Model(config, detection=True, - out_features=out_features, image_only=image_only) - else: - self.backbone = model_func(img_size=img_size, - out_features=out_features, - drop_path_rate=drop_path, - **model_kwargs) - self.name = name - - def forward(self, x): - """ - Args: - x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. 
- - Returns: - dict[str->Tensor]: names and the corresponding features - """ - if "layoutlmv3" in self.name: - return self.backbone.forward( - input_ids=x["input_ids"] if "input_ids" in x else None, - bbox=x["bbox"] if "bbox" in x else None, - images=x["images"] if "images" in x else None, - attention_mask=x["attention_mask"] if "attention_mask" in x else None, - # output_hidden_states=True, - ) - assert x.dim() == 4, f"VIT takes an input of shape (N, C, H, W). Got {x.shape} instead!" - return self.backbone.forward_features(x) - - def output_shape(self): - return { - name: ShapeSpec( - channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] - ) - for name in self._out_features - } - - -def build_VIT_backbone(cfg): - """ - Create a VIT instance from config. - - Args: - cfg: a detectron2 CfgNode - - Returns: - A VIT backbone instance. - """ - # fmt: off - name = cfg.MODEL.VIT.NAME - out_features = cfg.MODEL.VIT.OUT_FEATURES - drop_path = cfg.MODEL.VIT.DROP_PATH - img_size = cfg.MODEL.VIT.IMG_SIZE - pos_type = cfg.MODEL.VIT.POS_TYPE - - model_kwargs = eval(str(cfg.MODEL.VIT.MODEL_KWARGS).replace("`", "")) - - if 'layoutlmv3' in name: - if cfg.MODEL.CONFIG_PATH != '': - config_path = cfg.MODEL.CONFIG_PATH - else: - config_path = cfg.MODEL.WEIGHTS.replace('pytorch_model.bin', '') # layoutlmv3 pre-trained models - config_path = config_path.replace('model_final.pth', '') # detection fine-tuned models - else: - config_path = None - - return VIT_Backbone(name, out_features, drop_path, img_size, pos_type, model_kwargs, - config_path=config_path, image_only=cfg.MODEL.IMAGE_ONLY, cfg=cfg) - - -@BACKBONE_REGISTRY.register() -def build_vit_fpn_backbone(cfg, input_shape: ShapeSpec): - """ - Create a VIT w/ FPN backbone. - - Args: - cfg: a detectron2 CfgNode - - Returns: - backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. - """ - bottom_up = build_VIT_backbone(cfg) - in_features = cfg.MODEL.FPN.IN_FEATURES - out_channels = cfg.MODEL.FPN.OUT_CHANNELS - backbone = FPN( - bottom_up=bottom_up, - in_features=in_features, - out_channels=out_channels, - norm=cfg.MODEL.FPN.NORM, - top_block=LastLevelMaxPool(), - fuse_type=cfg.MODEL.FPN.FUSE_TYPE, - ) - return backbone diff --git a/magic_pdf/model/sub_modules/layout/layoutlmv3/beit.py b/magic_pdf/model/sub_modules/layout/layoutlmv3/beit.py deleted file mode 100644 index 03d4fabdc7816f19a8810e3c443643bc9e53e6b9..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/layout/layoutlmv3/beit.py +++ /dev/null @@ -1,671 +0,0 @@ -""" Vision Transformer (ViT) in PyTorch - -A PyTorch implement of Vision Transformers as described in -'An Image Is Worth 16 x 16 Words: Transformers for Image Recognition at Scale' - https://arxiv.org/abs/2010.11929 - -The official jax code is released and available at https://github.com/google-research/vision_transformer - -Status/TODO: -* Models updated to be compatible with official impl. Args added to support backward compat for old PyTorch weights. -* Weights ported from official jax impl for 384x384 base and small models, 16x16 and 32x32 patches. -* Trained (supervised on ImageNet-1k) my custom 'small' patch model to 77.9, 'base' to 79.4 top-1 with this code. -* Hopefully find time and GPUs for SSL or unsupervised pretraining on OpenImages w/ ImageNet fine-tune in future. - -Acknowledgments: -* The paper authors for releasing code and weights, thanks! -* I fixed my class token impl based on Phil Wang's https://github.com/lucidrains/vit-pytorch ... 
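The backbone wrapper above advertises four feature maps whose strides and channel widths depend on the model size; `output_shape()` simply packages the two lookup tables into detectron2 `ShapeSpec`s for the FPN. A plain-Python restatement of that contract (dicts instead of `ShapeSpec`, names and numbers copied from the code above):

```python
# Sketch: the stride/channel contract of the deleted VIT_Backbone, without detectron2.
# "base" models expose layers 3/5/7/11 at 768 channels; "large" models layers
# 7/11/15/23 at 1024 channels.
def vit_backbone_feature_spec(name: str, out_features):
    if "base" in name:
        strides = {"layer3": 4, "layer5": 8, "layer7": 16, "layer11": 32}
        channels = {k: 768 for k in strides}
    else:
        strides = {"layer7": 4, "layer11": 8, "layer15": 16, "layer23": 32}
        channels = {k: 1024 for k in strides}
    return {
        feat: {"channels": channels[feat], "stride": strides[feat]}
        for feat in out_features
    }

# e.g. vit_backbone_feature_spec("beit_base_patch16", ["layer3", "layer11"])
#   -> {"layer3": {"channels": 768, "stride": 4}, "layer11": {"channels": 768, "stride": 32}}
```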
check it out -for some einops/einsum fun -* Simple transformer style inspired by Andrej Karpathy's https://github.com/karpathy/minGPT -* Bert reference code checks against Huggingface Transformers and Tensorflow Bert - -Hacked together by / Copyright 2020 Ross Wightman -""" -import warnings -import math -import torch -from functools import partial -import torch.nn as nn -import torch.nn.functional as F -import torch.utils.checkpoint as checkpoint -from timm.models.layers import drop_path, to_2tuple, trunc_normal_ - - -def _cfg(url='', **kwargs): - return { - 'url': url, - 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, - 'crop_pct': .9, 'interpolation': 'bicubic', - 'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5), - **kwargs - } - - -class DropPath(nn.Module): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). - """ - - def __init__(self, drop_prob=None): - super(DropPath, self).__init__() - self.drop_prob = drop_prob - - def forward(self, x): - return drop_path(x, self.drop_prob, self.training) - - def extra_repr(self) -> str: - return 'p={}'.format(self.drop_prob) - - -class Mlp(nn.Module): - def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = nn.Linear(in_features, hidden_features) - self.act = act_layer() - self.fc2 = nn.Linear(hidden_features, out_features) - self.drop = nn.Dropout(drop) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - # x = self.drop(x) - # commit this for the orignal BERT implement - x = self.fc2(x) - x = self.drop(x) - return x - - -class Attention(nn.Module): - def __init__( - self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., - proj_drop=0., window_size=None, attn_head_dim=None): - super().__init__() - self.num_heads = num_heads - head_dim = dim // num_heads - if attn_head_dim is not None: - head_dim = attn_head_dim - all_head_dim = head_dim * self.num_heads - # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights - self.scale = qk_scale or head_dim ** -0.5 - - self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False) - if qkv_bias: - self.q_bias = nn.Parameter(torch.zeros(all_head_dim)) - self.v_bias = nn.Parameter(torch.zeros(all_head_dim)) - else: - self.q_bias = None - self.v_bias = None - - if window_size: - self.window_size = window_size - self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 - self.relative_position_bias_table = nn.Parameter( - torch.zeros(self.num_relative_distance, num_heads)) # 2*Wh-1 * 2*Ww-1, nH - # cls to token & token 2 cls & cls to cls - - # get pair-wise relative position index for each token inside the window - coords_h = torch.arange(window_size[0]) - coords_w = torch.arange(window_size[1]) - coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww - coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww - relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww - relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 - relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 - relative_coords[:, :, 1] += window_size[1] - 1 - relative_coords[:, :, 0] *= 2 * window_size[1] - 1 - relative_position_index = \ - torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype) 
- relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww - relative_position_index[0, 0:] = self.num_relative_distance - 3 - relative_position_index[0:, 0] = self.num_relative_distance - 2 - relative_position_index[0, 0] = self.num_relative_distance - 1 - - self.register_buffer("relative_position_index", relative_position_index) - - # trunc_normal_(self.relative_position_bias_table, std=.0) - else: - self.window_size = None - self.relative_position_bias_table = None - self.relative_position_index = None - - self.attn_drop = nn.Dropout(attn_drop) - self.proj = nn.Linear(all_head_dim, dim) - self.proj_drop = nn.Dropout(proj_drop) - - def forward(self, x, rel_pos_bias=None, training_window_size=None): - B, N, C = x.shape - qkv_bias = None - if self.q_bias is not None: - qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias)) - # qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) - qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) - qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) - q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) - - q = q * self.scale - attn = (q @ k.transpose(-2, -1)) - - if self.relative_position_bias_table is not None: - if training_window_size == self.window_size: - relative_position_bias = \ - self.relative_position_bias_table[self.relative_position_index.view(-1)].view( - self.window_size[0] * self.window_size[1] + 1, - self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH - relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww - attn = attn + relative_position_bias.unsqueeze(0) - else: - training_window_size = tuple(training_window_size.tolist()) - new_num_relative_distance = (2 * training_window_size[0] - 1) * (2 * training_window_size[1] - 1) + 3 - # new_num_relative_dis 为 所有可能的相对位置选项,包含cls-cls,tok-cls,与cls-tok - new_relative_position_bias_table = F.interpolate( - self.relative_position_bias_table[:-3, :].permute(1, 0).view(1, self.num_heads, - 2 * self.window_size[0] - 1, - 2 * self.window_size[1] - 1), - size=(2 * training_window_size[0] - 1, 2 * training_window_size[1] - 1), mode='bicubic', - align_corners=False) - new_relative_position_bias_table = new_relative_position_bias_table.view(self.num_heads, - new_num_relative_distance - 3).permute( - 1, 0) - new_relative_position_bias_table = torch.cat( - [new_relative_position_bias_table, self.relative_position_bias_table[-3::]], dim=0) - - # get pair-wise relative position index for each token inside the window - coords_h = torch.arange(training_window_size[0]) - coords_w = torch.arange(training_window_size[1]) - coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww - coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww - relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww - relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 - relative_coords[:, :, 0] += training_window_size[0] - 1 # shift to start from 0 - relative_coords[:, :, 1] += training_window_size[1] - 1 - relative_coords[:, :, 0] *= 2 * training_window_size[1] - 1 - relative_position_index = \ - torch.zeros(size=(training_window_size[0] * training_window_size[1] + 1,) * 2, - dtype=relative_coords.dtype) - relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww - relative_position_index[0, 0:] = new_num_relative_distance - 
3 - relative_position_index[0:, 0] = new_num_relative_distance - 2 - relative_position_index[0, 0] = new_num_relative_distance - 1 - - relative_position_bias = \ - new_relative_position_bias_table[relative_position_index.view(-1)].view( - training_window_size[0] * training_window_size[1] + 1, - training_window_size[0] * training_window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH - relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww - attn = attn + relative_position_bias.unsqueeze(0) - - if rel_pos_bias is not None: - attn = attn + rel_pos_bias - - attn = attn.softmax(dim=-1) - attn = self.attn_drop(attn) - - x = (attn @ v).transpose(1, 2).reshape(B, N, -1) - x = self.proj(x) - x = self.proj_drop(x) - return x - - -class Block(nn.Module): - - def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., - drop_path=0., init_values=None, act_layer=nn.GELU, norm_layer=nn.LayerNorm, - window_size=None, attn_head_dim=None): - super().__init__() - self.norm1 = norm_layer(dim) - self.attn = Attention( - dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, - attn_drop=attn_drop, proj_drop=drop, window_size=window_size, attn_head_dim=attn_head_dim) - # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here - self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() - self.norm2 = norm_layer(dim) - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) - - if init_values is not None: - self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True) - self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True) - else: - self.gamma_1, self.gamma_2 = None, None - - def forward(self, x, rel_pos_bias=None, training_window_size=None): - if self.gamma_1 is None: - x = x + self.drop_path( - self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias, training_window_size=training_window_size)) - x = x + self.drop_path(self.mlp(self.norm2(x))) - else: - x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias, - training_window_size=training_window_size)) - x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) - return x - - -class PatchEmbed(nn.Module): - """ Image to Patch Embedding - """ - - def __init__(self, img_size=[224, 224], patch_size=16, in_chans=3, embed_dim=768): - super().__init__() - img_size = to_2tuple(img_size) - patch_size = to_2tuple(patch_size) - num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) - self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) - self.num_patches_w = self.patch_shape[0] - self.num_patches_h = self.patch_shape[1] - # the so-called patch_shape is the patch shape during pre-training - self.img_size = img_size - self.patch_size = patch_size - self.num_patches = num_patches - - self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) - - def forward(self, x, position_embedding=None, **kwargs): - # FIXME look at relaxing size constraints - # assert H == self.img_size[0] and W == self.img_size[1], \ - # f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
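The index-building block that appears in `Attention` above (and again in `RelativePositionBias` further down) maps every pair of patch positions, plus the cls token, to a row of the learned bias table; at inference time with a different window size the table is bicubically resampled before lookup. A small, self-contained sketch of the index construction for a tiny window, following the pattern in the code above:

```python
# Sketch: the relative-position index construction used above, for a 2x2 window.
import torch

def build_relative_position_index(window_size):
    wh, ww = window_size
    num_relative_distance = (2 * wh - 1) * (2 * ww - 1) + 3  # +3 for the cls interactions
    coords = torch.stack(torch.meshgrid([torch.arange(wh), torch.arange(ww)]))  # 2, Wh, Ww
    coords_flatten = torch.flatten(coords, 1)                                   # 2, Wh*Ww
    relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]   # 2, N, N
    relative_coords = relative_coords.permute(1, 2, 0).contiguous()             # N, N, 2
    relative_coords[:, :, 0] += wh - 1            # shift so indices start at 0
    relative_coords[:, :, 1] += ww - 1
    relative_coords[:, :, 0] *= 2 * ww - 1
    index = torch.zeros((wh * ww + 1,) * 2, dtype=relative_coords.dtype)
    index[1:, 1:] = relative_coords.sum(-1)       # token <-> token distances
    index[0, 0:] = num_relative_distance - 3      # cls -> token
    index[0:, 0] = num_relative_distance - 2      # token -> cls
    index[0, 0] = num_relative_distance - 1       # cls -> cls
    return index

# For a 2x2 window the index is a 5x5 matrix whose values select rows of the
# (num_relative_distance x num_heads) bias table.
print(build_relative_position_index((2, 2)).shape)  # torch.Size([5, 5])
```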
- x = self.proj(x) - Hp, Wp = x.shape[2], x.shape[3] - - if position_embedding is not None: - # interpolate the position embedding to the corresponding size - position_embedding = position_embedding.view(1, self.patch_shape[0], self.patch_shape[1], -1).permute(0, 3, - 1, 2) - position_embedding = F.interpolate(position_embedding, size=(Hp, Wp), mode='bicubic') - x = x + position_embedding - - x = x.flatten(2).transpose(1, 2) - return x, (Hp, Wp) - - -class HybridEmbed(nn.Module): - """ CNN Feature Map Embedding - Extract feature map from CNN, flatten, project to embedding dim. - """ - - def __init__(self, backbone, img_size=[224, 224], feature_size=None, in_chans=3, embed_dim=768): - super().__init__() - assert isinstance(backbone, nn.Module) - img_size = to_2tuple(img_size) - self.img_size = img_size - self.backbone = backbone - if feature_size is None: - with torch.no_grad(): - # FIXME this is hacky, but most reliable way of determining the exact dim of the output feature - # map for all networks, the feature metadata has reliable channel and stride info, but using - # stride to calc feature dim requires info about padding of each stage that isn't captured. - training = backbone.training - if training: - backbone.eval() - o = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1]))[-1] - feature_size = o.shape[-2:] - feature_dim = o.shape[1] - backbone.train(training) - else: - feature_size = to_2tuple(feature_size) - feature_dim = self.backbone.feature_info.channels()[-1] - self.num_patches = feature_size[0] * feature_size[1] - self.proj = nn.Linear(feature_dim, embed_dim) - - def forward(self, x): - x = self.backbone(x)[-1] - x = x.flatten(2).transpose(1, 2) - x = self.proj(x) - return x - - -class RelativePositionBias(nn.Module): - - def __init__(self, window_size, num_heads): - super().__init__() - self.window_size = window_size - self.num_heads = num_heads - self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 - self.relative_position_bias_table = nn.Parameter( - torch.zeros(self.num_relative_distance, num_heads)) # 2*Wh-1 * 2*Ww-1, nH - # cls to token & token 2 cls & cls to cls - - # get pair-wise relative position index for each token inside the window - coords_h = torch.arange(window_size[0]) - coords_w = torch.arange(window_size[1]) - coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww - coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww - relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww - relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 - relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 - relative_coords[:, :, 1] += window_size[1] - 1 - relative_coords[:, :, 0] *= 2 * window_size[1] - 1 - relative_position_index = \ - torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype) - relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww - relative_position_index[0, 0:] = self.num_relative_distance - 3 - relative_position_index[0:, 0] = self.num_relative_distance - 2 - relative_position_index[0, 0] = self.num_relative_distance - 1 - - self.register_buffer("relative_position_index", relative_position_index) - - # trunc_normal_(self.relative_position_bias_table, std=.02) - - def forward(self, training_window_size): - if training_window_size == self.window_size: - relative_position_bias = \ - self.relative_position_bias_table[self.relative_position_index.view(-1)].view( - 
self.window_size[0] * self.window_size[1] + 1, - self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH - relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww - else: - training_window_size = tuple(training_window_size.tolist()) - new_num_relative_distance = (2 * training_window_size[0] - 1) * (2 * training_window_size[1] - 1) + 3 - # new_num_relative_dis 为 所有可能的相对位置选项,包含cls-cls,tok-cls,与cls-tok - new_relative_position_bias_table = F.interpolate( - self.relative_position_bias_table[:-3, :].permute(1, 0).view(1, self.num_heads, - 2 * self.window_size[0] - 1, - 2 * self.window_size[1] - 1), - size=(2 * training_window_size[0] - 1, 2 * training_window_size[1] - 1), mode='bicubic', - align_corners=False) - new_relative_position_bias_table = new_relative_position_bias_table.view(self.num_heads, - new_num_relative_distance - 3).permute( - 1, 0) - new_relative_position_bias_table = torch.cat( - [new_relative_position_bias_table, self.relative_position_bias_table[-3::]], dim=0) - - # get pair-wise relative position index for each token inside the window - coords_h = torch.arange(training_window_size[0]) - coords_w = torch.arange(training_window_size[1]) - coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww - coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww - relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww - relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 - relative_coords[:, :, 0] += training_window_size[0] - 1 # shift to start from 0 - relative_coords[:, :, 1] += training_window_size[1] - 1 - relative_coords[:, :, 0] *= 2 * training_window_size[1] - 1 - relative_position_index = \ - torch.zeros(size=(training_window_size[0] * training_window_size[1] + 1,) * 2, - dtype=relative_coords.dtype) - relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww - relative_position_index[0, 0:] = new_num_relative_distance - 3 - relative_position_index[0:, 0] = new_num_relative_distance - 2 - relative_position_index[0, 0] = new_num_relative_distance - 1 - - relative_position_bias = \ - new_relative_position_bias_table[relative_position_index.view(-1)].view( - training_window_size[0] * training_window_size[1] + 1, - training_window_size[0] * training_window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH - relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww - - return relative_position_bias - - -class BEiT(nn.Module): - """ Vision Transformer with support for patch or hybrid CNN input stage - """ - - def __init__(self, - img_size=[224, 224], - patch_size=16, - in_chans=3, - num_classes=80, - embed_dim=768, - depth=12, - num_heads=12, - mlp_ratio=4., - qkv_bias=False, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0., - hybrid_backbone=None, - norm_layer=None, - init_values=None, - use_abs_pos_emb=False, - use_rel_pos_bias=False, - use_shared_rel_pos_bias=False, - use_checkpoint=True, - pretrained=None, - out_features=None, - ): - - super(BEiT, self).__init__() - - norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) - self.num_classes = num_classes - self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models - self.use_checkpoint = use_checkpoint - - if hybrid_backbone is not None: - self.patch_embed = HybridEmbed( - hybrid_backbone, img_size=img_size, in_chans=in_chans, embed_dim=embed_dim) - else: - self.patch_embed = PatchEmbed( - 
img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) - num_patches = self.patch_embed.num_patches - self.out_features = out_features - self.out_indices = [int(name[5:]) for name in out_features] - - self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) - # self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) - if use_abs_pos_emb: - self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) - else: - self.pos_embed = None - self.pos_drop = nn.Dropout(p=drop_rate) - - self.use_shared_rel_pos_bias = use_shared_rel_pos_bias - if use_shared_rel_pos_bias: - self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.patch_shape, num_heads=num_heads) - else: - self.rel_pos_bias = None - - dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule - self.use_rel_pos_bias = use_rel_pos_bias - self.blocks = nn.ModuleList([ - Block( - dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, - drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, - init_values=init_values, window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None) - for i in range(depth)]) - - # trunc_normal_(self.mask_token, std=.02) - - if patch_size == 16: - self.fpn1 = nn.Sequential( - nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), - # nn.SyncBatchNorm(embed_dim), - nn.BatchNorm2d(embed_dim), - nn.GELU(), - nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), - ) - - self.fpn2 = nn.Sequential( - nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), - ) - - self.fpn3 = nn.Identity() - - self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2) - elif patch_size == 8: - self.fpn1 = nn.Sequential( - nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), - ) - - self.fpn2 = nn.Identity() - - self.fpn3 = nn.Sequential( - nn.MaxPool2d(kernel_size=2, stride=2), - ) - - self.fpn4 = nn.Sequential( - nn.MaxPool2d(kernel_size=4, stride=4), - ) - - if self.pos_embed is not None: - trunc_normal_(self.pos_embed, std=.02) - trunc_normal_(self.cls_token, std=.02) - self.apply(self._init_weights) - self.fix_init_weight() - - def fix_init_weight(self): - def rescale(param, layer_id): - param.div_(math.sqrt(2.0 * layer_id)) - - for layer_id, layer in enumerate(self.blocks): - rescale(layer.attn.proj.weight.data, layer_id + 1) - rescale(layer.mlp.fc2.weight.data, layer_id + 1) - - def _init_weights(self, m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - - ''' - def init_weights(self): - """Initialize the weights in backbone. - - Args: - pretrained (str, optional): Path to pre-trained weights. - Defaults to None. 
- """ - logger = get_root_logger() - - if self.pos_embed is not None: - trunc_normal_(self.pos_embed, std=.02) - trunc_normal_(self.cls_token, std=.02) - self.apply(self._init_weights) - self.fix_init_weight() - - if self.init_cfg is None: - logger.warn(f'No pre-trained weights for ' - f'{self.__class__.__name__}, ' - f'training start from scratch') - else: - assert 'checkpoint' in self.init_cfg, f'Only support ' \ - f'specify `Pretrained` in ' \ - f'`init_cfg` in ' \ - f'{self.__class__.__name__} ' - logger.info(f"Will load ckpt from {self.init_cfg['checkpoint']}") - load_checkpoint(self, - filename=self.init_cfg['checkpoint'], - strict=False, - logger=logger, - beit_spec_expand_rel_pos = self.use_rel_pos_bias, - ) - ''' - - def get_num_layers(self): - return len(self.blocks) - - @torch.jit.ignore - def no_weight_decay(self): - return {'pos_embed', 'cls_token'} - - def forward_features(self, x): - B, C, H, W = x.shape - x, (Hp, Wp) = self.patch_embed(x, self.pos_embed[:, 1:, :] if self.pos_embed is not None else None) - # Hp, Wp are HW for patches - batch_size, seq_len, _ = x.size() - - cls_tokens = self.cls_token.expand(batch_size, -1, -1) # stole cls_tokens impl from Phil Wang, thanks - if self.pos_embed is not None: - cls_tokens = cls_tokens + self.pos_embed[:, :1, :] - x = torch.cat((cls_tokens, x), dim=1) - x = self.pos_drop(x) - - features = [] - training_window_size = torch.tensor([Hp, Wp]) - - rel_pos_bias = self.rel_pos_bias(training_window_size) if self.rel_pos_bias is not None else None - - for i, blk in enumerate(self.blocks): - if self.use_checkpoint: - x = checkpoint.checkpoint(blk, x, rel_pos_bias, training_window_size) - else: - x = blk(x, rel_pos_bias=rel_pos_bias, training_window_size=training_window_size) - if i in self.out_indices: - xp = x[:, 1:, :].permute(0, 2, 1).reshape(B, -1, Hp, Wp) - features.append(xp.contiguous()) - - ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4] - for i in range(len(features)): - features[i] = ops[i](features[i]) - - feat_out = {} - - for name, value in zip(self.out_features, features): - feat_out[name] = value - - return feat_out - - def forward(self, x): - x = self.forward_features(x) - return x - - -def beit_base_patch16(pretrained=False, **kwargs): - model = BEiT( - patch_size=16, - embed_dim=768, - depth=12, - num_heads=12, - mlp_ratio=4, - qkv_bias=True, - norm_layer=partial(nn.LayerNorm, eps=1e-6), - init_values=None, - **kwargs) - model.default_cfg = _cfg() - return model - -def beit_large_patch16(pretrained=False, **kwargs): - model = BEiT( - patch_size=16, - embed_dim=1024, - depth=24, - num_heads=16, - mlp_ratio=4, - qkv_bias=True, - norm_layer=partial(nn.LayerNorm, eps=1e-6), - init_values=None, - **kwargs) - model.default_cfg = _cfg() - return model - -def dit_base_patch16(pretrained=False, **kwargs): - model = BEiT( - patch_size=16, - embed_dim=768, - depth=12, - num_heads=12, - mlp_ratio=4, - qkv_bias=True, - norm_layer=partial(nn.LayerNorm, eps=1e-6), - init_values=0.1, - **kwargs) - model.default_cfg = _cfg() - return model - -def dit_large_patch16(pretrained=False, **kwargs): - model = BEiT( - patch_size=16, - embed_dim=1024, - depth=24, - num_heads=16, - mlp_ratio=4, - qkv_bias=True, - norm_layer=partial(nn.LayerNorm, eps=1e-6), - init_values=1e-5, - **kwargs) - model.default_cfg = _cfg() - return model - -if __name__ == '__main__': - model = BEiT(use_checkpoint=True, use_shared_rel_pos_bias=True) - model = model.to("cuda:0") - input1 = torch.rand(2, 3, 512, 762).to("cuda:0") - input2 = torch.rand(2, 3, 800, 
1200).to("cuda:0") - input3 = torch.rand(2, 3, 720, 1000).to("cuda:0") - output1 = model(input1) - output2 = model(input2) - output3 = model(input3) - print("all done") diff --git a/magic_pdf/model/sub_modules/layout/layoutlmv3/deit.py b/magic_pdf/model/sub_modules/layout/layoutlmv3/deit.py deleted file mode 100644 index 9a13bb0a8514df29fb4b0ec58c3726ba9c221a8a..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/layout/layoutlmv3/deit.py +++ /dev/null @@ -1,476 +0,0 @@ -""" -Mostly copy-paste from DINO and timm library: -https://github.com/facebookresearch/dino -https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py -""" -import warnings - -import math -import torch -import torch.nn as nn -import torch.utils.checkpoint as checkpoint -from timm.models.layers import trunc_normal_, drop_path, to_2tuple -from functools import partial - -def _cfg(url='', **kwargs): - return { - 'url': url, - 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, - 'crop_pct': .9, 'interpolation': 'bicubic', - 'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5), - **kwargs - } - -class DropPath(nn.Module): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). - """ - - def __init__(self, drop_prob=None): - super(DropPath, self).__init__() - self.drop_prob = drop_prob - - def forward(self, x): - return drop_path(x, self.drop_prob, self.training) - - def extra_repr(self) -> str: - return 'p={}'.format(self.drop_prob) - - -class Mlp(nn.Module): - def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = nn.Linear(in_features, hidden_features) - self.act = act_layer() - self.fc2 = nn.Linear(hidden_features, out_features) - self.drop = nn.Dropout(drop) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.drop(x) - x = self.fc2(x) - x = self.drop(x) - return x - - -class Attention(nn.Module): - def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): - super().__init__() - self.num_heads = num_heads - head_dim = dim // num_heads - # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights - self.scale = qk_scale or head_dim ** -0.5 - - self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop) - self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop) - - def forward(self, x): - B, N, C = x.shape - q, k, v = self.qkv(x).reshape(B, N, 3, self.num_heads, - C // self.num_heads).permute(2, 0, 3, 1, 4) - - attn = (q @ k.transpose(-2, -1)) * self.scale - attn = attn.softmax(dim=-1) - attn = self.attn_drop(attn) - - x = (attn @ v).transpose(1, 2).reshape(B, N, C) - x = self.proj(x) - x = self.proj_drop(x) - return x - - -class Block(nn.Module): - - def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., - drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): - super().__init__() - self.norm1 = norm_layer(dim) - self.attn = Attention( - dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) - # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here - self.drop_path = DropPath( - drop_path) if drop_path > 0. 
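For patch size 16 the BEiT/DiT backbone above takes the same stride-16 token grid from four intermediate blocks and rescales each copy into a stride-4/8/16/32 pyramid: `fpn1` upsamples 4x, `fpn2` 2x, `fpn3` is the identity, and `fpn4` downsamples 2x. A shape-only sketch of that rescaling, assuming a 768-dim base model:

```python
# Sketch: the four pyramid heads used above for patch_size == 16, and the shapes
# they produce from a (B, 768, Hp, Wp) token grid at stride 16.
import torch
import torch.nn as nn

embed_dim = 768
fpn1 = nn.Sequential(                     # stride 16 -> 4
    nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
    nn.BatchNorm2d(embed_dim),
    nn.GELU(),
    nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
)
fpn2 = nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2)  # -> stride 8
fpn3 = nn.Identity()                                                      # -> stride 16
fpn4 = nn.MaxPool2d(kernel_size=2, stride=2)                              # -> stride 32

x = torch.randn(1, embed_dim, 32, 32)     # e.g. a 512x512 input at stride 16
for name, op in [("fpn1", fpn1), ("fpn2", fpn2), ("fpn3", fpn3), ("fpn4", fpn4)]:
    print(name, tuple(op(x).shape))
# fpn1 (1, 768, 128, 128)  fpn2 (1, 768, 64, 64)  fpn3 (1, 768, 32, 32)  fpn4 (1, 768, 16, 16)
```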
else nn.Identity() - self.norm2 = norm_layer(dim) - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, - act_layer=act_layer, drop=drop) - - def forward(self, x): - x = x + self.drop_path(self.attn(self.norm1(x))) - x = x + self.drop_path(self.mlp(self.norm2(x))) - return x - - -class PatchEmbed(nn.Module): - """ Image to Patch Embedding - """ - - def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): - super().__init__() - img_size = to_2tuple(img_size) - patch_size = to_2tuple(patch_size) - - self.window_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) - - self.num_patches_w, self.num_patches_h = self.window_size - - self.num_patches = self.window_size[0] * self.window_size[1] - self.img_size = img_size - self.patch_size = patch_size - - self.proj = nn.Conv2d(in_chans, embed_dim, - kernel_size=patch_size, stride=patch_size) - - def forward(self, x): - x = self.proj(x) - return x - - -class HybridEmbed(nn.Module): - """ CNN Feature Map Embedding - Extract feature map from CNN, flatten, project to embedding dim. - """ - - def __init__(self, backbone, img_size=224, feature_size=None, in_chans=3, embed_dim=768): - super().__init__() - assert isinstance(backbone, nn.Module) - img_size = to_2tuple(img_size) - self.img_size = img_size - self.backbone = backbone - if feature_size is None: - with torch.no_grad(): - # FIXME this is hacky, but most reliable way of determining the exact dim of the output feature - # map for all networks, the feature metadata has reliable channel and stride info, but using - # stride to calc feature dim requires info about padding of each stage that isn't captured. - training = backbone.training - if training: - backbone.eval() - o = self.backbone(torch.zeros( - 1, in_chans, img_size[0], img_size[1]))[-1] - feature_size = o.shape[-2:] - feature_dim = o.shape[1] - backbone.train(training) - else: - feature_size = to_2tuple(feature_size) - feature_dim = self.backbone.feature_info.channels()[-1] - self.num_patches = feature_size[0] * feature_size[1] - self.proj = nn.Linear(feature_dim, embed_dim) - - def forward(self, x): - x = self.backbone(x)[-1] - x = x.flatten(2).transpose(1, 2) - x = self.proj(x) - return x - - -class ViT(nn.Module): - """ Vision Transformer with support for patch or hybrid CNN input stage - """ - - def __init__(self, - model_name='vit_base_patch16_224', - img_size=384, - patch_size=16, - in_chans=3, - embed_dim=1024, - depth=24, - num_heads=16, - num_classes=19, - mlp_ratio=4., - qkv_bias=True, - qk_scale=None, - drop_rate=0.1, - attn_drop_rate=0., - drop_path_rate=0., - hybrid_backbone=None, - norm_layer=partial(nn.LayerNorm, eps=1e-6), - norm_cfg=None, - pos_embed_interp=False, - random_init=False, - align_corners=False, - use_checkpoint=False, - num_extra_tokens=1, - out_features=None, - **kwargs, - ): - - super(ViT, self).__init__() - self.model_name = model_name - self.img_size = img_size - self.patch_size = patch_size - self.in_chans = in_chans - self.embed_dim = embed_dim - self.depth = depth - self.num_heads = num_heads - self.num_classes = num_classes - self.mlp_ratio = mlp_ratio - self.qkv_bias = qkv_bias - self.qk_scale = qk_scale - self.drop_rate = drop_rate - self.attn_drop_rate = attn_drop_rate - self.drop_path_rate = drop_path_rate - self.hybrid_backbone = hybrid_backbone - self.norm_layer = norm_layer - self.norm_cfg = norm_cfg - self.pos_embed_interp = pos_embed_interp - self.random_init = random_init - self.align_corners = align_corners - 
self.use_checkpoint = use_checkpoint - self.num_extra_tokens = num_extra_tokens - self.out_features = out_features - self.out_indices = [int(name[5:]) for name in out_features] - - # self.num_stages = self.depth - # self.out_indices = tuple(range(self.num_stages)) - - if self.hybrid_backbone is not None: - self.patch_embed = HybridEmbed( - self.hybrid_backbone, img_size=self.img_size, in_chans=self.in_chans, embed_dim=self.embed_dim) - else: - self.patch_embed = PatchEmbed( - img_size=self.img_size, patch_size=self.patch_size, in_chans=self.in_chans, embed_dim=self.embed_dim) - self.num_patches = self.patch_embed.num_patches - - self.cls_token = nn.Parameter(torch.zeros(1, 1, self.embed_dim)) - - if self.num_extra_tokens == 2: - self.dist_token = nn.Parameter(torch.zeros(1, 1, self.embed_dim)) - - self.pos_embed = nn.Parameter(torch.zeros( - 1, self.num_patches + self.num_extra_tokens, self.embed_dim)) - self.pos_drop = nn.Dropout(p=self.drop_rate) - - # self.num_extra_tokens = self.pos_embed.shape[-2] - self.num_patches - dpr = [x.item() for x in torch.linspace(0, self.drop_path_rate, - self.depth)] # stochastic depth decay rule - self.blocks = nn.ModuleList([ - Block( - dim=self.embed_dim, num_heads=self.num_heads, mlp_ratio=self.mlp_ratio, qkv_bias=self.qkv_bias, - qk_scale=self.qk_scale, - drop=self.drop_rate, attn_drop=self.attn_drop_rate, drop_path=dpr[i], norm_layer=self.norm_layer) - for i in range(self.depth)]) - - # NOTE as per official impl, we could have a pre-logits representation dense layer + tanh here - # self.repr = nn.Linear(embed_dim, representation_size) - # self.repr_act = nn.Tanh() - - if patch_size == 16: - self.fpn1 = nn.Sequential( - nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), - nn.SyncBatchNorm(embed_dim), - nn.GELU(), - nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), - ) - - self.fpn2 = nn.Sequential( - nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), - ) - - self.fpn3 = nn.Identity() - - self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2) - elif patch_size == 8: - self.fpn1 = nn.Sequential( - nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), - ) - - self.fpn2 = nn.Identity() - - self.fpn3 = nn.Sequential( - nn.MaxPool2d(kernel_size=2, stride=2), - ) - - self.fpn4 = nn.Sequential( - nn.MaxPool2d(kernel_size=4, stride=4), - ) - - trunc_normal_(self.pos_embed, std=.02) - trunc_normal_(self.cls_token, std=.02) - if self.num_extra_tokens==2: - trunc_normal_(self.dist_token, std=0.2) - self.apply(self._init_weights) - # self.fix_init_weight() - - def fix_init_weight(self): - def rescale(param, layer_id): - param.div_(math.sqrt(2.0 * layer_id)) - - for layer_id, layer in enumerate(self.blocks): - rescale(layer.attn.proj.weight.data, layer_id + 1) - rescale(layer.mlp.fc2.weight.data, layer_id + 1) - - def _init_weights(self, m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - - ''' - def init_weights(self): - logger = get_root_logger() - - trunc_normal_(self.pos_embed, std=.02) - trunc_normal_(self.cls_token, std=.02) - self.apply(self._init_weights) - - if self.init_cfg is None: - logger.warn(f'No pre-trained weights for ' - f'{self.__class__.__name__}, ' - f'training start from scratch') - else: - assert 'checkpoint' in self.init_cfg, f'Only support ' \ - f'specify `Pretrained` in ' \ - 
f'`init_cfg` in ' \ - f'{self.__class__.__name__} ' - logger.info(f"Will load ckpt from {self.init_cfg['checkpoint']}") - load_checkpoint(self, filename=self.init_cfg['checkpoint'], strict=False, logger=logger) - ''' - - def get_num_layers(self): - return len(self.blocks) - - @torch.jit.ignore - def no_weight_decay(self): - return {'pos_embed', 'cls_token'} - - def _conv_filter(self, state_dict, patch_size=16): - """ convert patch embedding weight from manual patchify + linear proj to conv""" - out_dict = {} - for k, v in state_dict.items(): - if 'patch_embed.proj.weight' in k: - v = v.reshape((v.shape[0], 3, patch_size, patch_size)) - out_dict[k] = v - return out_dict - - def to_2D(self, x): - n, hw, c = x.shape - h = w = int(math.sqrt(hw)) - x = x.transpose(1, 2).reshape(n, c, h, w) - return x - - def to_1D(self, x): - n, c, h, w = x.shape - x = x.reshape(n, c, -1).transpose(1, 2) - return x - - def interpolate_pos_encoding(self, x, w, h): - npatch = x.shape[1] - self.num_extra_tokens - N = self.pos_embed.shape[1] - self.num_extra_tokens - if npatch == N and w == h: - return self.pos_embed - - class_ORdist_pos_embed = self.pos_embed[:, 0:self.num_extra_tokens] - - patch_pos_embed = self.pos_embed[:, self.num_extra_tokens:] - - dim = x.shape[-1] - w0 = w // self.patch_embed.patch_size[0] - h0 = h // self.patch_embed.patch_size[1] - # we add a small number to avoid floating point error in the interpolation - # see discussion at https://github.com/facebookresearch/dino/issues/8 - w0, h0 = w0 + 0.1, h0 + 0.1 - patch_pos_embed = nn.functional.interpolate( - patch_pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).permute(0, 3, 1, 2), - scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)), - mode='bicubic', - ) - assert int(w0) == patch_pos_embed.shape[-2] and int(h0) == patch_pos_embed.shape[-1] - patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) - - return torch.cat((class_ORdist_pos_embed, patch_pos_embed), dim=1) - - def prepare_tokens(self, x, mask=None): - B, nc, w, h = x.shape - # patch linear embedding - x = self.patch_embed(x) - - # mask image modeling - if mask is not None: - x = self.mask_model(x, mask) - x = x.flatten(2).transpose(1, 2) - - # add the [CLS] token to the embed patch tokens - all_tokens = [self.cls_token.expand(B, -1, -1)] - - if self.num_extra_tokens == 2: - dist_tokens = self.dist_token.expand(B, -1, -1) - all_tokens.append(dist_tokens) - all_tokens.append(x) - - x = torch.cat(all_tokens, dim=1) - - # add positional encoding to each token - x = x + self.interpolate_pos_encoding(x, w, h) - - return self.pos_drop(x) - - def forward_features(self, x): - # print(f"==========shape of x is {x.shape}==========") - B, _, H, W = x.shape - Hp, Wp = H // self.patch_size, W // self.patch_size - x = self.prepare_tokens(x) - - features = [] - for i, blk in enumerate(self.blocks): - if self.use_checkpoint: - x = checkpoint.checkpoint(blk, x) - else: - x = blk(x) - if i in self.out_indices: - xp = x[:, self.num_extra_tokens:, :].permute(0, 2, 1).reshape(B, -1, Hp, Wp) - features.append(xp.contiguous()) - - ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4] - for i in range(len(features)): - features[i] = ops[i](features[i]) - - feat_out = {} - - for name, value in zip(self.out_features, features): - feat_out[name] = value - - return feat_out - - def forward(self, x): - x = self.forward_features(x) - return x - - -def deit_base_patch16(pretrained=False, **kwargs): - model = ViT( - patch_size=16, - drop_rate=0., - embed_dim=768, - depth=12, - 
num_heads=12, - num_classes=1000, - mlp_ratio=4., - qkv_bias=True, - use_checkpoint=True, - num_extra_tokens=2, - **kwargs) - model.default_cfg = _cfg() - return model - -def mae_base_patch16(pretrained=False, **kwargs): - model = ViT( - patch_size=16, - drop_rate=0., - embed_dim=768, - depth=12, - num_heads=12, - num_classes=1000, - mlp_ratio=4., - qkv_bias=True, - use_checkpoint=True, - num_extra_tokens=1, - **kwargs) - model.default_cfg = _cfg() - return model \ No newline at end of file diff --git a/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/__init__.py b/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/__init__.py deleted file mode 100644 index cd997b55f3118a01f5d49ae2f080525c7d7c9534..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from .models import ( - LayoutLMv3Config, - LayoutLMv3ForTokenClassification, - LayoutLMv3ForQuestionAnswering, - LayoutLMv3ForSequenceClassification, - LayoutLMv3Tokenizer, -) diff --git a/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/__init__.py b/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/__init__.py deleted file mode 100644 index 5bcec6c7c65b7add5c3440f106b8f1049781167a..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# flake8: noqa -from .data_collator import DataCollatorForKeyValueExtraction diff --git a/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/cord.py b/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/cord.py deleted file mode 100644 index 820dc53a4ea8bc79ddac2d36b57ea2110e8d27d5..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/cord.py +++ /dev/null @@ -1,171 +0,0 @@ -''' -Reference: https://huggingface.co/datasets/pierresi/cord/blob/main/cord.py -''' - - -import json -import os -from pathlib import Path -import datasets -from .image_utils import load_image, normalize_bbox -logger = datasets.logging.get_logger(__name__) -_CITATION = """\ -@article{park2019cord, - title={CORD: A Consolidated Receipt Dataset for Post-OCR Parsing}, - author={Park, Seunghyun and Shin, Seung and Lee, Bado and Lee, Junyeop and Surh, Jaeheung and Seo, Minjoon and Lee, Hwalsuk} - booktitle={Document Intelligence Workshop at Neural Information Processing Systems} - year={2019} -} -""" -_DESCRIPTION = """\ -https://github.com/clovaai/cord/ -""" - -def quad_to_box(quad): - # test 87 is wrongly annotated - box = ( - max(0, quad["x1"]), - max(0, quad["y1"]), - quad["x3"], - quad["y3"] - ) - if box[3] < box[1]: - bbox = list(box) - tmp = bbox[3] - bbox[3] = bbox[1] - bbox[1] = tmp - box = tuple(bbox) - if box[2] < box[0]: - bbox = list(box) - tmp = bbox[2] - bbox[2] = bbox[0] - bbox[0] = tmp - box = tuple(bbox) - return box - -def _get_drive_url(url): - base_url = 'https://drive.google.com/uc?id=' - split_url = url.split('/') - return base_url + split_url[5] - -_URLS = [ - _get_drive_url("https://drive.google.com/file/d/1MqhTbcj-AHXOqYoeoh12aRUwIprzTJYI/"), - _get_drive_url("https://drive.google.com/file/d/1wYdp5nC9LnHQZ2FcmOoC0eClyWvcuARU/") - # If you failed to download the dataset through the automatic downloader, - # you can download it manually and modify the code to get the local dataset. - # Or you can use the following links. Please follow the original LICENSE of CORD for usage. 
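The `interpolate_pos_encoding` helper in the ViT above lets detection-sized inputs reuse position embeddings trained on a fixed square patch grid by bicubically resampling that grid to the current input's patch layout, while the cls/dist tokens keep their embeddings. A compact sketch of that resampling (the original rescales via `scale_factor` with a small epsilon; this sketch passes the target size directly):

```python
# Sketch of the positional-embedding resampling used by the deleted ViT above.
import math
import torch
import torch.nn.functional as F

def resample_pos_embed(pos_embed, grid_hw, num_extra_tokens=1):
    """pos_embed: (1, num_extra_tokens + N, dim) trained on a square sqrt(N) grid.
    grid_hw: (h, w) patch grid of the current input. Returns (1, extra + h*w, dim)."""
    extra = pos_embed[:, :num_extra_tokens]
    patch = pos_embed[:, num_extra_tokens:]
    n, dim = patch.shape[1], patch.shape[-1]
    side = int(math.sqrt(n))
    patch = patch.reshape(1, side, side, dim).permute(0, 3, 1, 2)
    patch = F.interpolate(patch, size=grid_hw, mode="bicubic", align_corners=False)
    patch = patch.permute(0, 2, 3, 1).reshape(1, -1, dim)
    return torch.cat((extra, patch), dim=1)

# e.g. a table pretrained on a 14x14 grid, resampled for a 40x30 patch layout:
table = torch.randn(1, 1 + 14 * 14, 768)
print(resample_pos_embed(table, (40, 30)).shape)  # torch.Size([1, 1201, 768])
```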
- # "https://layoutlm.blob.core.windows.net/cord/CORD-1k-001.zip", - # "https://layoutlm.blob.core.windows.net/cord/CORD-1k-002.zip" -] - -class CordConfig(datasets.BuilderConfig): - """BuilderConfig for CORD""" - def __init__(self, **kwargs): - """BuilderConfig for CORD. - Args: - **kwargs: keyword arguments forwarded to super. - """ - super(CordConfig, self).__init__(**kwargs) - -class Cord(datasets.GeneratorBasedBuilder): - BUILDER_CONFIGS = [ - CordConfig(name="cord", version=datasets.Version("1.0.0"), description="CORD dataset"), - ] - - def _info(self): - return datasets.DatasetInfo( - description=_DESCRIPTION, - features=datasets.Features( - { - "id": datasets.Value("string"), - "words": datasets.Sequence(datasets.Value("string")), - "bboxes": datasets.Sequence(datasets.Sequence(datasets.Value("int64"))), - "ner_tags": datasets.Sequence( - datasets.features.ClassLabel( - names=["O","B-MENU.NM","B-MENU.NUM","B-MENU.UNITPRICE","B-MENU.CNT","B-MENU.DISCOUNTPRICE","B-MENU.PRICE","B-MENU.ITEMSUBTOTAL","B-MENU.VATYN","B-MENU.ETC","B-MENU.SUB_NM","B-MENU.SUB_UNITPRICE","B-MENU.SUB_CNT","B-MENU.SUB_PRICE","B-MENU.SUB_ETC","B-VOID_MENU.NM","B-VOID_MENU.PRICE","B-SUB_TOTAL.SUBTOTAL_PRICE","B-SUB_TOTAL.DISCOUNT_PRICE","B-SUB_TOTAL.SERVICE_PRICE","B-SUB_TOTAL.OTHERSVC_PRICE","B-SUB_TOTAL.TAX_PRICE","B-SUB_TOTAL.ETC","B-TOTAL.TOTAL_PRICE","B-TOTAL.TOTAL_ETC","B-TOTAL.CASHPRICE","B-TOTAL.CHANGEPRICE","B-TOTAL.CREDITCARDPRICE","B-TOTAL.EMONEYPRICE","B-TOTAL.MENUTYPE_CNT","B-TOTAL.MENUQTY_CNT","I-MENU.NM","I-MENU.NUM","I-MENU.UNITPRICE","I-MENU.CNT","I-MENU.DISCOUNTPRICE","I-MENU.PRICE","I-MENU.ITEMSUBTOTAL","I-MENU.VATYN","I-MENU.ETC","I-MENU.SUB_NM","I-MENU.SUB_UNITPRICE","I-MENU.SUB_CNT","I-MENU.SUB_PRICE","I-MENU.SUB_ETC","I-VOID_MENU.NM","I-VOID_MENU.PRICE","I-SUB_TOTAL.SUBTOTAL_PRICE","I-SUB_TOTAL.DISCOUNT_PRICE","I-SUB_TOTAL.SERVICE_PRICE","I-SUB_TOTAL.OTHERSVC_PRICE","I-SUB_TOTAL.TAX_PRICE","I-SUB_TOTAL.ETC","I-TOTAL.TOTAL_PRICE","I-TOTAL.TOTAL_ETC","I-TOTAL.CASHPRICE","I-TOTAL.CHANGEPRICE","I-TOTAL.CREDITCARDPRICE","I-TOTAL.EMONEYPRICE","I-TOTAL.MENUTYPE_CNT","I-TOTAL.MENUQTY_CNT"] - ) - ), - "image": datasets.Array3D(shape=(3, 224, 224), dtype="uint8"), - "image_path": datasets.Value("string"), - } - ), - supervised_keys=None, - citation=_CITATION, - homepage="https://github.com/clovaai/cord/", - ) - - def _split_generators(self, dl_manager): - """Returns SplitGenerators.""" - """Uses local files located with data_dir""" - downloaded_file = dl_manager.download_and_extract(_URLS) - # move files from the second URL together with files from the first one. 
- dest = Path(downloaded_file[0])/"CORD" - for split in ["train", "dev", "test"]: - for file_type in ["image", "json"]: - if split == "test" and file_type == "json": - continue - files = (Path(downloaded_file[1])/"CORD"/split/file_type).iterdir() - for f in files: - os.rename(f, dest/split/file_type/f.name) - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, gen_kwargs={"filepath": dest/"train"} - ), - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, gen_kwargs={"filepath": dest/"dev"} - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, gen_kwargs={"filepath": dest/"test"} - ), - ] - - def get_line_bbox(self, bboxs): - x = [bboxs[i][j] for i in range(len(bboxs)) for j in range(0, len(bboxs[i]), 2)] - y = [bboxs[i][j] for i in range(len(bboxs)) for j in range(1, len(bboxs[i]), 2)] - - x0, y0, x1, y1 = min(x), min(y), max(x), max(y) - - assert x1 >= x0 and y1 >= y0 - bbox = [[x0, y0, x1, y1] for _ in range(len(bboxs))] - return bbox - - def _generate_examples(self, filepath): - logger.info("⏳ Generating examples from = %s", filepath) - ann_dir = os.path.join(filepath, "json") - img_dir = os.path.join(filepath, "image") - for guid, file in enumerate(sorted(os.listdir(ann_dir))): - words = [] - bboxes = [] - ner_tags = [] - file_path = os.path.join(ann_dir, file) - with open(file_path, "r", encoding="utf8") as f: - data = json.load(f) - image_path = os.path.join(img_dir, file) - image_path = image_path.replace("json", "png") - image, size = load_image(image_path) - for item in data["valid_line"]: - cur_line_bboxes = [] - line_words, label = item["words"], item["category"] - line_words = [w for w in line_words if w["text"].strip() != ""] - if len(line_words) == 0: - continue - if label == "other": - for w in line_words: - words.append(w["text"]) - ner_tags.append("O") - cur_line_bboxes.append(normalize_bbox(quad_to_box(w["quad"]), size)) - else: - words.append(line_words[0]["text"]) - ner_tags.append("B-" + label.upper()) - cur_line_bboxes.append(normalize_bbox(quad_to_box(line_words[0]["quad"]), size)) - for w in line_words[1:]: - words.append(w["text"]) - ner_tags.append("I-" + label.upper()) - cur_line_bboxes.append(normalize_bbox(quad_to_box(w["quad"]), size)) - # by default: --segment_level_layout 1 - # if do not want to use segment_level_layout, comment the following line - cur_line_bboxes = self.get_line_bbox(cur_line_bboxes) - bboxes.extend(cur_line_bboxes) - # yield guid, {"id": str(guid), "words": words, "bboxes": bboxes, "ner_tags": ner_tags, "image": image} - yield guid, {"id": str(guid), "words": words, "bboxes": bboxes, "ner_tags": ner_tags, - "image": image, "image_path": image_path} diff --git a/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/data_collator.py b/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/data_collator.py deleted file mode 100644 index 4232a6660ba2678ba20be5479629550419a798b4..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/data_collator.py +++ /dev/null @@ -1,124 +0,0 @@ -import torch -from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Union - -from transformers import BatchEncoding, PreTrainedTokenizerBase -from transformers.data.data_collator import ( - DataCollatorMixin, - _torch_collate_batch, -) -from transformers.file_utils import PaddingStrategy - -from typing import NewType -InputDataClass = NewType("InputDataClass", Any) - -def pre_calc_rel_mat(segment_ids): - valid_span = 
torch.zeros((segment_ids.shape[0], segment_ids.shape[1], segment_ids.shape[1]), - device=segment_ids.device, dtype=torch.bool) - for i in range(segment_ids.shape[0]): - for j in range(segment_ids.shape[1]): - valid_span[i, j, :] = segment_ids[i, :] == segment_ids[i, j] - - return valid_span - -@dataclass -class DataCollatorForKeyValueExtraction(DataCollatorMixin): - """ - Data collator that will dynamically pad the inputs received, as well as the labels. - Args: - tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`): - The tokenizer used for encoding the data. - padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`): - Select a strategy to pad the returned sequences (according to the model's padding side and padding index) - among: - * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single - sequence if provided). - * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the - maximum acceptable input length for the model if that argument is not provided. - * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of - different lengths). - max_length (:obj:`int`, `optional`): - Maximum length of the returned list and optionally padding length (see above). - pad_to_multiple_of (:obj:`int`, `optional`): - If set will pad the sequence to a multiple of the provided value. - This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= - 7.5 (Volta). - label_pad_token_id (:obj:`int`, `optional`, defaults to -100): - The id to use when padding the labels (-100 will be automatically ignore by PyTorch loss functions). - """ - - tokenizer: PreTrainedTokenizerBase - padding: Union[bool, str, PaddingStrategy] = True - max_length: Optional[int] = None - pad_to_multiple_of: Optional[int] = None - label_pad_token_id: int = -100 - - def __call__(self, features): - label_name = "label" if "label" in features[0].keys() else "labels" - labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None - - images = None - if "images" in features[0]: - images = torch.stack([torch.tensor(d.pop("images")) for d in features]) - IMAGE_LEN = int(images.shape[-1] / 16) * int(images.shape[-1] / 16) + 1 - - batch = self.tokenizer.pad( - features, - padding=self.padding, - max_length=self.max_length, - pad_to_multiple_of=self.pad_to_multiple_of, - # Conversion to tensors will fail if we have labels as they are not of the same length yet. 
- return_tensors="pt" if labels is None else None, - ) - - if images is not None: - batch["images"] = images - batch = {k: torch.tensor(v, dtype=torch.int64) if isinstance(v[0], list) and k == 'attention_mask' else v - for k, v in batch.items()} - visual_attention_mask = torch.ones((len(batch['input_ids']), IMAGE_LEN), dtype=torch.long) - batch["attention_mask"] = torch.cat([batch['attention_mask'], visual_attention_mask], dim=1) - - if labels is None: - return batch - - has_bbox_input = "bbox" in features[0] - has_position_input = "position_ids" in features[0] - padding_idx=self.tokenizer.pad_token_id - sequence_length = torch.tensor(batch["input_ids"]).shape[1] - padding_side = self.tokenizer.padding_side - if padding_side == "right": - batch["labels"] = [label + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels] - if has_bbox_input: - batch["bbox"] = [bbox + [[0, 0, 0, 0]] * (sequence_length - len(bbox)) for bbox in batch["bbox"]] - if has_position_input: - batch["position_ids"] = [position_id + [padding_idx] * (sequence_length - len(position_id)) - for position_id in batch["position_ids"]] - - else: - batch["labels"] = [[self.label_pad_token_id] * (sequence_length - len(label)) + label for label in labels] - if has_bbox_input: - batch["bbox"] = [[[0, 0, 0, 0]] * (sequence_length - len(bbox)) + bbox for bbox in batch["bbox"]] - if has_position_input: - batch["position_ids"] = [[padding_idx] * (sequence_length - len(position_id)) - + position_id for position_id in batch["position_ids"]] - - if 'segment_ids' in batch: - assert 'position_ids' in batch - for i in range(len(batch['segment_ids'])): - batch['segment_ids'][i] = batch['segment_ids'][i] + [batch['segment_ids'][i][-1] + 1] * (sequence_length - len(batch['segment_ids'][i])) + [ - batch['segment_ids'][i][-1] + 2] * IMAGE_LEN - - batch = {k: torch.tensor(v, dtype=torch.int64) if isinstance(v[0], list) else v for k, v in batch.items()} - - if 'segment_ids' in batch: - valid_span = pre_calc_rel_mat( - segment_ids=batch['segment_ids'] - ) - batch['valid_span'] = valid_span - del batch['segment_ids'] - - if images is not None: - visual_labels = torch.ones((len(batch['input_ids']), IMAGE_LEN), dtype=torch.long) * -100 - batch["labels"] = torch.cat([batch['labels'], visual_labels], dim=1) - - return batch diff --git a/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/funsd.py b/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/funsd.py deleted file mode 100644 index 9f34042023042b10d52906d4ba5ca9c87e65a600..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/funsd.py +++ /dev/null @@ -1,136 +0,0 @@ -# coding=utf-8 -''' -Reference: https://huggingface.co/datasets/nielsr/funsd/blob/main/funsd.py -''' -import json -import os - -import datasets - -from .image_utils import load_image, normalize_bbox - - -logger = datasets.logging.get_logger(__name__) - - -_CITATION = """\ -@article{Jaume2019FUNSDAD, - title={FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents}, - author={Guillaume Jaume and H. K. Ekenel and J. Thiran}, - journal={2019 International Conference on Document Analysis and Recognition Workshops (ICDARW)}, - year={2019}, - volume={2}, - pages={1-6} -} -""" - -_DESCRIPTION = """\ -https://guillaumejaume.github.io/FUNSD/ -""" - - -class FunsdConfig(datasets.BuilderConfig): - """BuilderConfig for FUNSD""" - - def __init__(self, **kwargs): - """BuilderConfig for FUNSD. 
- - Args: - **kwargs: keyword arguments forwarded to super. - """ - super(FunsdConfig, self).__init__(**kwargs) - - -class Funsd(datasets.GeneratorBasedBuilder): - """Conll2003 dataset.""" - - BUILDER_CONFIGS = [ - FunsdConfig(name="funsd", version=datasets.Version("1.0.0"), description="FUNSD dataset"), - ] - - def _info(self): - return datasets.DatasetInfo( - description=_DESCRIPTION, - features=datasets.Features( - { - "id": datasets.Value("string"), - "tokens": datasets.Sequence(datasets.Value("string")), - "bboxes": datasets.Sequence(datasets.Sequence(datasets.Value("int64"))), - "ner_tags": datasets.Sequence( - datasets.features.ClassLabel( - names=["O", "B-HEADER", "I-HEADER", "B-QUESTION", "I-QUESTION", "B-ANSWER", "I-ANSWER"] - ) - ), - "image": datasets.Array3D(shape=(3, 224, 224), dtype="uint8"), - "image_path": datasets.Value("string"), - } - ), - supervised_keys=None, - homepage="https://guillaumejaume.github.io/FUNSD/", - citation=_CITATION, - ) - - def _split_generators(self, dl_manager): - """Returns SplitGenerators.""" - downloaded_file = dl_manager.download_and_extract("https://guillaumejaume.github.io/FUNSD/dataset.zip") - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, gen_kwargs={"filepath": f"{downloaded_file}/dataset/training_data/"} - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, gen_kwargs={"filepath": f"{downloaded_file}/dataset/testing_data/"} - ), - ] - - def get_line_bbox(self, bboxs): - x = [bboxs[i][j] for i in range(len(bboxs)) for j in range(0, len(bboxs[i]), 2)] - y = [bboxs[i][j] for i in range(len(bboxs)) for j in range(1, len(bboxs[i]), 2)] - - x0, y0, x1, y1 = min(x), min(y), max(x), max(y) - - assert x1 >= x0 and y1 >= y0 - bbox = [[x0, y0, x1, y1] for _ in range(len(bboxs))] - return bbox - - def _generate_examples(self, filepath): - logger.info("⏳ Generating examples from = %s", filepath) - ann_dir = os.path.join(filepath, "annotations") - img_dir = os.path.join(filepath, "images") - for guid, file in enumerate(sorted(os.listdir(ann_dir))): - tokens = [] - bboxes = [] - ner_tags = [] - - file_path = os.path.join(ann_dir, file) - with open(file_path, "r", encoding="utf8") as f: - data = json.load(f) - image_path = os.path.join(img_dir, file) - image_path = image_path.replace("json", "png") - image, size = load_image(image_path) - for item in data["form"]: - cur_line_bboxes = [] - words, label = item["words"], item["label"] - words = [w for w in words if w["text"].strip() != ""] - if len(words) == 0: - continue - if label == "other": - for w in words: - tokens.append(w["text"]) - ner_tags.append("O") - cur_line_bboxes.append(normalize_bbox(w["box"], size)) - else: - tokens.append(words[0]["text"]) - ner_tags.append("B-" + label.upper()) - cur_line_bboxes.append(normalize_bbox(words[0]["box"], size)) - for w in words[1:]: - tokens.append(w["text"]) - ner_tags.append("I-" + label.upper()) - cur_line_bboxes.append(normalize_bbox(w["box"], size)) - # by default: --segment_level_layout 1 - # if do not want to use segment_level_layout, comment the following line - cur_line_bboxes = self.get_line_bbox(cur_line_bboxes) - # box = normalize_bbox(item["box"], size) - # cur_line_bboxes = [box for _ in range(len(words))] - bboxes.extend(cur_line_bboxes) - yield guid, {"id": str(guid), "tokens": tokens, "bboxes": bboxes, "ner_tags": ner_tags, - "image": image, "image_path": image_path} \ No newline at end of file diff --git a/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/image_utils.py 
b/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/image_utils.py deleted file mode 100644 index 90a4b34373980246d6397b95b91e84461f3f2580..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/image_utils.py +++ /dev/null @@ -1,284 +0,0 @@ -import torchvision.transforms.functional as F -import warnings -import math -import random -import numpy as np -from PIL import Image -import torch - -from detectron2.data.detection_utils import read_image -from detectron2.data.transforms import ResizeTransform, TransformList - -def normalize_bbox(bbox, size): - return [ - int(1000 * bbox[0] / size[0]), - int(1000 * bbox[1] / size[1]), - int(1000 * bbox[2] / size[0]), - int(1000 * bbox[3] / size[1]), - ] - - -def load_image(image_path): - image = read_image(image_path, format="BGR") - h = image.shape[0] - w = image.shape[1] - img_trans = TransformList([ResizeTransform(h=h, w=w, new_h=224, new_w=224)]) - image = torch.tensor(img_trans.apply_image(image).copy()).permute(2, 0, 1) # copy to make it writeable - return image, (w, h) - - -def crop(image, i, j, h, w, boxes=None): - cropped_image = F.crop(image, i, j, h, w) - - if boxes is not None: - # Currently we cannot use this case since when some boxes is out of the cropped image, - # it may be better to drop out these boxes along with their text input (instead of min or clamp) - # which haven't been implemented here - max_size = torch.as_tensor([w, h], dtype=torch.float32) - cropped_boxes = torch.as_tensor(boxes) - torch.as_tensor([j, i, j, i]) - cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size) - cropped_boxes = cropped_boxes.clamp(min=0) - boxes = cropped_boxes.reshape(-1, 4) - - return cropped_image, boxes - - -def resize(image, size, interpolation, boxes=None): - # It seems that we do not need to resize boxes here, since the boxes will be resized to 1000x1000 finally, - # which is compatible with a square image size of 224x224 - rescaled_image = F.resize(image, size, interpolation) - - if boxes is None: - return rescaled_image, None - - ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size)) - ratio_width, ratio_height = ratios - - # boxes = boxes.copy() - scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height]) - - return rescaled_image, scaled_boxes - - -def clamp(num, min_value, max_value): - return max(min(num, max_value), min_value) - - -def get_bb(bb, page_size): - bbs = [float(j) for j in bb] - xs, ys = [], [] - for i, b in enumerate(bbs): - if i % 2 == 0: - xs.append(b) - else: - ys.append(b) - (width, height) = page_size - return_bb = [ - clamp(min(xs), 0, width - 1), - clamp(min(ys), 0, height - 1), - clamp(max(xs), 0, width - 1), - clamp(max(ys), 0, height - 1), - ] - return_bb = [ - int(1000 * return_bb[0] / width), - int(1000 * return_bb[1] / height), - int(1000 * return_bb[2] / width), - int(1000 * return_bb[3] / height), - ] - return return_bb - - -class ToNumpy: - - def __call__(self, pil_img): - np_img = np.array(pil_img, dtype=np.uint8) - if np_img.ndim < 3: - np_img = np.expand_dims(np_img, axis=-1) - np_img = np.rollaxis(np_img, 2) # HWC to CHW - return np_img - - -class ToTensor: - - def __init__(self, dtype=torch.float32): - self.dtype = dtype - - def __call__(self, pil_img): - np_img = np.array(pil_img, dtype=np.uint8) - if np_img.ndim < 3: - np_img = np.expand_dims(np_img, axis=-1) - np_img = np.rollaxis(np_img, 2) # HWC to CHW - return 
torch.from_numpy(np_img).to(dtype=self.dtype) - - -_pil_interpolation_to_str = { - F.InterpolationMode.NEAREST: 'F.InterpolationMode.NEAREST', - F.InterpolationMode.BILINEAR: 'F.InterpolationMode.BILINEAR', - F.InterpolationMode.BICUBIC: 'F.InterpolationMode.BICUBIC', - F.InterpolationMode.LANCZOS: 'F.InterpolationMode.LANCZOS', - F.InterpolationMode.HAMMING: 'F.InterpolationMode.HAMMING', - F.InterpolationMode.BOX: 'F.InterpolationMode.BOX', -} - - -def _pil_interp(method): - if method == 'bicubic': - return F.InterpolationMode.BICUBIC - elif method == 'lanczos': - return F.InterpolationMode.LANCZOS - elif method == 'hamming': - return F.InterpolationMode.HAMMING - else: - # default bilinear, do we want to allow nearest? - return F.InterpolationMode.BILINEAR - - -class Compose: - """Composes several transforms together. This transform does not support torchscript. - Please, see the note below. - - Args: - transforms (list of ``Transform`` objects): list of transforms to compose. - - Example: - >>> transforms.Compose([ - >>> transforms.CenterCrop(10), - >>> transforms.PILToTensor(), - >>> transforms.ConvertImageDtype(torch.float), - >>> ]) - - .. note:: - In order to script the transformations, please use ``torch.nn.Sequential`` as below. - - >>> transforms = torch.nn.Sequential( - >>> transforms.CenterCrop(10), - >>> transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), - >>> ) - >>> scripted_transforms = torch.jit.script(transforms) - - Make sure to use only scriptable transformations, i.e. that work with ``torch.Tensor``, does not require - `lambda` functions or ``PIL.Image``. - - """ - - def __init__(self, transforms): - self.transforms = transforms - - def __call__(self, img, augmentation=False, box=None): - for t in self.transforms: - img = t(img, augmentation, box) - return img - - -class RandomResizedCropAndInterpolationWithTwoPic: - """Crop the given PIL Image to random size and aspect ratio with random interpolation. - A crop of random size (default: of 0.08 to 1.0) of the original size and a random - aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio is made. This crop - is finally resized to given size. - This is popularly used to train the Inception networks. - Args: - size: expected output size of each edge - scale: range of size of the origin size cropped - ratio: range of aspect ratio of the origin aspect ratio cropped - interpolation: Default: PIL.Image.BILINEAR - """ - - def __init__(self, size, second_size=None, scale=(0.08, 1.0), ratio=(3. / 4., 4. / 3.), - interpolation='bilinear', second_interpolation='lanczos'): - if isinstance(size, tuple): - self.size = size - else: - self.size = (size, size) - if second_size is not None: - if isinstance(second_size, tuple): - self.second_size = second_size - else: - self.second_size = (second_size, second_size) - else: - self.second_size = None - if (scale[0] > scale[1]) or (ratio[0] > ratio[1]): - warnings.warn("range should be of kind (min, max)") - - self.interpolation = _pil_interp(interpolation) - self.second_interpolation = _pil_interp(second_interpolation) - self.scale = scale - self.ratio = ratio - - @staticmethod - def get_params(img, scale, ratio): - """Get parameters for ``crop`` for a random sized crop. - Args: - img (PIL Image): Image to be cropped. - scale (tuple): range of size of the origin size cropped - ratio (tuple): range of aspect ratio of the origin aspect ratio cropped - Returns: - tuple: params (i, j, h, w) to be passed to ``crop`` for a random - sized crop. 
- """ - area = img.size[0] * img.size[1] - - for attempt in range(10): - target_area = random.uniform(*scale) * area - log_ratio = (math.log(ratio[0]), math.log(ratio[1])) - aspect_ratio = math.exp(random.uniform(*log_ratio)) - - w = int(round(math.sqrt(target_area * aspect_ratio))) - h = int(round(math.sqrt(target_area / aspect_ratio))) - - if w <= img.size[0] and h <= img.size[1]: - i = random.randint(0, img.size[1] - h) - j = random.randint(0, img.size[0] - w) - return i, j, h, w - - # Fallback to central crop - in_ratio = img.size[0] / img.size[1] - if in_ratio < min(ratio): - w = img.size[0] - h = int(round(w / min(ratio))) - elif in_ratio > max(ratio): - h = img.size[1] - w = int(round(h * max(ratio))) - else: # whole image - w = img.size[0] - h = img.size[1] - i = (img.size[1] - h) // 2 - j = (img.size[0] - w) // 2 - return i, j, h, w - - def __call__(self, img, augmentation=False, box=None): - """ - Args: - img (PIL Image): Image to be cropped and resized. - Returns: - PIL Image: Randomly cropped and resized image. - """ - if augmentation: - i, j, h, w = self.get_params(img, self.scale, self.ratio) - img = F.crop(img, i, j, h, w) - # img, box = crop(img, i, j, h, w, box) - img = F.resize(img, self.size, self.interpolation) - second_img = F.resize(img, self.second_size, self.second_interpolation) \ - if self.second_size is not None else None - return img, second_img - - def __repr__(self): - if isinstance(self.interpolation, (tuple, list)): - interpolate_str = ' '.join([_pil_interpolation_to_str[x] for x in self.interpolation]) - else: - interpolate_str = _pil_interpolation_to_str[self.interpolation] - format_string = self.__class__.__name__ + '(size={0}'.format(self.size) - format_string += ', scale={0}'.format(tuple(round(s, 4) for s in self.scale)) - format_string += ', ratio={0}'.format(tuple(round(r, 4) for r in self.ratio)) - format_string += ', interpolation={0}'.format(interpolate_str) - if self.second_size is not None: - format_string += ', second_size={0}'.format(self.second_size) - format_string += ', second_interpolation={0}'.format(_pil_interpolation_to_str[self.second_interpolation]) - format_string += ')' - return format_string - - -def pil_loader(path: str) -> Image.Image: - # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) - with open(path, 'rb') as f: - img = Image.open(f) - return img.convert('RGB') diff --git a/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/xfund.py b/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/xfund.py deleted file mode 100644 index 7749ba5dd1d59a4e0c5baf4f2c27cffaae3e4e12..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/xfund.py +++ /dev/null @@ -1,213 +0,0 @@ -import os -import json - -import torch -from torch.utils.data.dataset import Dataset -from torchvision import transforms -from PIL import Image - -from .image_utils import Compose, RandomResizedCropAndInterpolationWithTwoPic - -XFund_label2ids = { - "O":0, - 'B-HEADER':1, - 'I-HEADER':2, - 'B-QUESTION':3, - 'I-QUESTION':4, - 'B-ANSWER':5, - 'I-ANSWER':6, -} - -class xfund_dataset(Dataset): - def box_norm(self, box, width, height): - def clip(min_num, num, max_num): - return min(max(num, min_num), max_num) - - x0, y0, x1, y1 = box - x0 = clip(0, int((x0 / width) * 1000), 1000) - y0 = clip(0, int((y0 / height) * 1000), 1000) - x1 = clip(0, int((x1 / width) * 1000), 1000) - y1 = clip(0, int((y1 / height) * 1000), 1000) - assert x1 >= x0 - assert y1 >= 
y0 - return [x0, y0, x1, y1] - - def get_segment_ids(self, bboxs): - segment_ids = [] - for i in range(len(bboxs)): - if i == 0: - segment_ids.append(0) - else: - if bboxs[i - 1] == bboxs[i]: - segment_ids.append(segment_ids[-1]) - else: - segment_ids.append(segment_ids[-1] + 1) - return segment_ids - - def get_position_ids(self, segment_ids): - position_ids = [] - for i in range(len(segment_ids)): - if i == 0: - position_ids.append(2) - else: - if segment_ids[i] == segment_ids[i - 1]: - position_ids.append(position_ids[-1] + 1) - else: - position_ids.append(2) - return position_ids - - def load_data( - self, - data_file, - ): - # re-org data format - total_data = {"id": [], "lines": [], "bboxes": [], "ner_tags": [], "image_path": []} - for i in range(len(data_file['documents'])): - width, height = data_file['documents'][i]['img']['width'], data_file['documents'][i]['img'][ - 'height'] - - cur_doc_lines, cur_doc_bboxes, cur_doc_ner_tags, cur_doc_image_path = [], [], [], [] - for j in range(len(data_file['documents'][i]['document'])): - cur_item = data_file['documents'][i]['document'][j] - cur_doc_lines.append(cur_item['text']) - cur_doc_bboxes.append(self.box_norm(cur_item['box'], width=width, height=height)) - cur_doc_ner_tags.append(cur_item['label']) - total_data['id'] += [len(total_data['id'])] - total_data['lines'] += [cur_doc_lines] - total_data['bboxes'] += [cur_doc_bboxes] - total_data['ner_tags'] += [cur_doc_ner_tags] - total_data['image_path'] += [data_file['documents'][i]['img']['fname']] - - # tokenize text and get bbox/label - total_input_ids, total_bboxs, total_label_ids = [], [], [] - for i in range(len(total_data['lines'])): - cur_doc_input_ids, cur_doc_bboxs, cur_doc_labels = [], [], [] - for j in range(len(total_data['lines'][i])): - cur_input_ids = self.tokenizer(total_data['lines'][i][j], truncation=False, add_special_tokens=False, return_attention_mask=False)['input_ids'] - if len(cur_input_ids) == 0: continue - - cur_label = total_data['ner_tags'][i][j].upper() - if cur_label == 'OTHER': - cur_labels = ["O"] * len(cur_input_ids) - for k in range(len(cur_labels)): - cur_labels[k] = self.label2ids[cur_labels[k]] - else: - cur_labels = [cur_label] * len(cur_input_ids) - cur_labels[0] = self.label2ids['B-' + cur_labels[0]] - for k in range(1, len(cur_labels)): - cur_labels[k] = self.label2ids['I-' + cur_labels[k]] - assert len(cur_input_ids) == len([total_data['bboxes'][i][j]] * len(cur_input_ids)) == len(cur_labels) - cur_doc_input_ids += cur_input_ids - cur_doc_bboxs += [total_data['bboxes'][i][j]] * len(cur_input_ids) - cur_doc_labels += cur_labels - assert len(cur_doc_input_ids) == len(cur_doc_bboxs) == len(cur_doc_labels) - assert len(cur_doc_input_ids) > 0 - - total_input_ids.append(cur_doc_input_ids) - total_bboxs.append(cur_doc_bboxs) - total_label_ids.append(cur_doc_labels) - assert len(total_input_ids) == len(total_bboxs) == len(total_label_ids) - - # split text to several slices because of over-length - input_ids, bboxs, labels = [], [], [] - segment_ids, position_ids = [], [] - image_path = [] - for i in range(len(total_input_ids)): - start = 0 - cur_iter = 0 - while start < len(total_input_ids[i]): - end = min(start + 510, len(total_input_ids[i])) - - input_ids.append([self.tokenizer.cls_token_id] + total_input_ids[i][start: end] + [self.tokenizer.sep_token_id]) - bboxs.append([[0, 0, 0, 0]] + total_bboxs[i][start: end] + [[1000, 1000, 1000, 1000]]) - labels.append([-100] + total_label_ids[i][start: end] + [-100]) - - cur_segment_ids = 
self.get_segment_ids(bboxs[-1]) - cur_position_ids = self.get_position_ids(cur_segment_ids) - segment_ids.append(cur_segment_ids) - position_ids.append(cur_position_ids) - image_path.append(os.path.join(self.args.data_dir, "images", total_data['image_path'][i])) - - start = end - cur_iter += 1 - - assert len(input_ids) == len(bboxs) == len(labels) == len(segment_ids) == len(position_ids) - assert len(segment_ids) == len(image_path) - - res = { - 'input_ids': input_ids, - 'bbox': bboxs, - 'labels': labels, - 'segment_ids': segment_ids, - 'position_ids': position_ids, - 'image_path': image_path, - } - return res - - def __init__( - self, - args, - tokenizer, - mode - ): - self.args = args - self.mode = mode - self.cur_la = args.language - self.tokenizer = tokenizer - self.label2ids = XFund_label2ids - - - self.common_transform = Compose([ - RandomResizedCropAndInterpolationWithTwoPic( - size=args.input_size, interpolation=args.train_interpolation, - ), - ]) - - self.patch_transform = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize( - mean=torch.tensor((0.5, 0.5, 0.5)), - std=torch.tensor((0.5, 0.5, 0.5))) - ]) - - data_file = json.load( - open(os.path.join(args.data_dir, "{}.{}.json".format(self.cur_la, 'train' if mode == 'train' else 'val')), - 'r')) - - self.feature = self.load_data(data_file) - - def __len__(self): - return len(self.feature['input_ids']) - - def __getitem__(self, index): - input_ids = self.feature["input_ids"][index] - - # attention_mask = self.feature["attention_mask"][index] - attention_mask = [1] * len(input_ids) - labels = self.feature["labels"][index] - bbox = self.feature["bbox"][index] - segment_ids = self.feature['segment_ids'][index] - position_ids = self.feature['position_ids'][index] - - img = pil_loader(self.feature['image_path'][index]) - for_patches, _ = self.common_transform(img, augmentation=False) - patch = self.patch_transform(for_patches) - - assert len(input_ids) == len(attention_mask) == len(labels) == len(bbox) == len(segment_ids) - - res = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "labels": labels, - "bbox": bbox, - "segment_ids": segment_ids, - "position_ids": position_ids, - "images": patch, - } - return res - -def pil_loader(path: str) -> Image.Image: - # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) - with open(path, 'rb') as f: - img = Image.open(f) - return img.convert('RGB') \ No newline at end of file diff --git a/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/__init__.py b/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/__init__.py deleted file mode 100644 index 0b3100effb34547bbaba7503288db34374cad9ca..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from .layoutlmv3 import ( - LayoutLMv3Config, - LayoutLMv3ForTokenClassification, - LayoutLMv3ForQuestionAnswering, - LayoutLMv3ForSequenceClassification, - LayoutLMv3Tokenizer, -) diff --git a/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py b/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py deleted file mode 100644 index e06a24b0ca9971cfe99dc9ef60ce8e495ff406bd..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py +++ /dev/null @@ -1,24 +0,0 @@ -from transformers import AutoConfig, AutoModel, 
AutoModelForTokenClassification, \ - AutoModelForQuestionAnswering, AutoModelForSequenceClassification, AutoTokenizer -from transformers.convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS, RobertaConverter - -from .configuration_layoutlmv3 import LayoutLMv3Config -from .modeling_layoutlmv3 import ( - LayoutLMv3ForTokenClassification, - LayoutLMv3ForQuestionAnswering, - LayoutLMv3ForSequenceClassification, - LayoutLMv3Model, -) -from .tokenization_layoutlmv3 import LayoutLMv3Tokenizer -from .tokenization_layoutlmv3_fast import LayoutLMv3TokenizerFast - - -#AutoConfig.register("layoutlmv3", LayoutLMv3Config) -#AutoModel.register(LayoutLMv3Config, LayoutLMv3Model) -#AutoModelForTokenClassification.register(LayoutLMv3Config, LayoutLMv3ForTokenClassification) -#AutoModelForQuestionAnswering.register(LayoutLMv3Config, LayoutLMv3ForQuestionAnswering) -#AutoModelForSequenceClassification.register(LayoutLMv3Config, LayoutLMv3ForSequenceClassification) -#AutoTokenizer.register( -# LayoutLMv3Config, slow_tokenizer_class=LayoutLMv3Tokenizer, fast_tokenizer_class=LayoutLMv3TokenizerFast -#) -SLOW_TO_FAST_CONVERTERS.update({"LayoutLMv3Tokenizer": RobertaConverter}) diff --git a/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py b/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py deleted file mode 100644 index d2c7b4d71b4d51504dee8bc10e50ea91bac00270..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py +++ /dev/null @@ -1,60 +0,0 @@ -# coding=utf-8 -from transformers.models.bert.configuration_bert import BertConfig -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - -LAYOUTLMV3_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "layoutlmv3-base": "https://huggingface.co/microsoft/layoutlmv3-base/resolve/main/config.json", - "layoutlmv3-large": "https://huggingface.co/microsoft/layoutlmv3-large/resolve/main/config.json", - # See all LayoutLMv3 models at https://huggingface.co/models?filter=layoutlmv3 -} - - -class LayoutLMv3Config(BertConfig): - model_type = "layoutlmv3" - - def __init__( - self, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - max_2d_position_embeddings=1024, - coordinate_size=None, - shape_size=None, - has_relative_attention_bias=False, - rel_pos_bins=32, - max_rel_pos=128, - has_spatial_attention_bias=False, - rel_2d_pos_bins=64, - max_rel_2d_pos=256, - visual_embed=True, - mim=False, - wpa_task=False, - discrete_vae_weight_path='', - discrete_vae_type='dall-e', - input_size=224, - second_input_size=112, - device='cuda', - **kwargs - ): - """Constructs RobertaConfig.""" - super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) - self.max_2d_position_embeddings = max_2d_position_embeddings - self.coordinate_size = coordinate_size - self.shape_size = shape_size - self.has_relative_attention_bias = has_relative_attention_bias - self.rel_pos_bins = rel_pos_bins - self.max_rel_pos = max_rel_pos - self.has_spatial_attention_bias = has_spatial_attention_bias - self.rel_2d_pos_bins = rel_2d_pos_bins - self.max_rel_2d_pos = max_rel_2d_pos - self.visual_embed = visual_embed - self.mim = mim - self.wpa_task = wpa_task - self.discrete_vae_weight_path = discrete_vae_weight_path - self.discrete_vae_type = discrete_vae_type - self.input_size = input_size - self.second_input_size = second_input_size - self.device = device 
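[Editor's note] A minimal sketch of how the configuration above is meant to be used. The import path assumes the pre-deletion package layout shown in this diff, and the field values are the usual layoutlmv3-base settings rather than anything read from a checkpoint:

# Minimal sketch, assuming the configuration module above is still importable.
from magic_pdf.model.sub_modules.layout.layoutlmv3.layoutlmft.models.layoutlmv3.configuration_layoutlmv3 import LayoutLMv3Config

config = LayoutLMv3Config(
    coordinate_size=128,               # per-coordinate (x/y) embedding width
    shape_size=128,                    # box height/width embedding width
    has_relative_attention_bias=True,  # enable the 1D relative position bias
    has_spatial_attention_bias=True,   # enable the 2D (bbox) relative position bias
    visual_embed=True,                 # keep the ViT-style patch branch
    input_size=224,                    # (224 / 16) ** 2 + 1 = 197 visual tokens
)
# The six spatial embeddings are concatenated, so 4 * coordinate_size + 2 * shape_size
# must equal hidden_size (4 * 128 + 2 * 128 = 768 for the base model).
assert 4 * config.coordinate_size + 2 * config.shape_size == config.hidden_size
assert config.max_2d_position_embeddings == 1024  # bbox coordinates must stay in [0, 1023]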
diff --git a/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py b/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py deleted file mode 100644 index 113eb8eb1d123a4985c1894e0caab561b19f64c2..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py +++ /dev/null @@ -1,1282 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch LayoutLMv3 model. """ -import math - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.utils.checkpoint -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers import apply_chunking_to_forward -from transformers.modeling_outputs import ( - BaseModelOutputWithPastAndCrossAttentions, - BaseModelOutputWithPoolingAndCrossAttentions, - MaskedLMOutput, - TokenClassifierOutput, - QuestionAnsweringModelOutput, - SequenceClassifierOutput, -) -from transformers.modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer -from transformers.models.roberta.modeling_roberta import ( - RobertaIntermediate, - RobertaLMHead, - RobertaOutput, - RobertaSelfOutput, -) -from transformers.utils import logging - -from .configuration_layoutlmv3 import LayoutLMv3Config -from timm.models.layers import to_2tuple - - -logger = logging.get_logger(__name__) - - -class PatchEmbed(nn.Module): - """ Image to Patch Embedding - """ - def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): - super().__init__() - img_size = to_2tuple(img_size) - patch_size = to_2tuple(patch_size) - self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) - self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) - # The following variables are used in detection mycheckpointer.py - self.num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) - self.num_patches_w = self.patch_shape[0] - self.num_patches_h = self.patch_shape[1] - - def forward(self, x, position_embedding=None): - x = self.proj(x) - - if position_embedding is not None: - # interpolate the position embedding to the corresponding size - position_embedding = position_embedding.view(1, self.patch_shape[0], self.patch_shape[1], -1).permute(0, 3, 1, 2) - Hp, Wp = x.shape[2], x.shape[3] - position_embedding = F.interpolate(position_embedding, size=(Hp, Wp), mode='bicubic') - x = x + position_embedding - - x = x.flatten(2).transpose(1, 2) - return x - -class LayoutLMv3Embeddings(nn.Module): - """ - Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. 
- """ - - # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__ - def __init__(self, config): - super().__init__() - self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) - - self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) - - # End copy - self.padding_idx = config.pad_token_id - self.position_embeddings = nn.Embedding( - config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx - ) - - self.x_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.coordinate_size) - self.y_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.coordinate_size) - self.h_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.shape_size) - self.w_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.shape_size) - - def _calc_spatial_position_embeddings(self, bbox): - try: - assert torch.all(0 <= bbox) and torch.all(bbox <= 1023) - left_position_embeddings = self.x_position_embeddings(bbox[:, :, 0]) - upper_position_embeddings = self.y_position_embeddings(bbox[:, :, 1]) - right_position_embeddings = self.x_position_embeddings(bbox[:, :, 2]) - lower_position_embeddings = self.y_position_embeddings(bbox[:, :, 3]) - except IndexError as e: - raise IndexError("The :obj:`bbox` coordinate values should be within 0-1000 range.") from e - - h_position_embeddings = self.h_position_embeddings(torch.clip(bbox[:, :, 3] - bbox[:, :, 1], 0, 1023)) - w_position_embeddings = self.w_position_embeddings(torch.clip(bbox[:, :, 2] - bbox[:, :, 0], 0, 1023)) - - # below is the difference between LayoutLMEmbeddingsV2 (torch.cat) and LayoutLMEmbeddingsV1 (add) - spatial_position_embeddings = torch.cat( - [ - left_position_embeddings, - upper_position_embeddings, - right_position_embeddings, - lower_position_embeddings, - h_position_embeddings, - w_position_embeddings, - ], - dim=-1, - ) - return spatial_position_embeddings - - def create_position_ids_from_input_ids(self, input_ids, padding_idx, past_key_values_length=0): - """ - Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols - are ignored. This is modified from fairseq's `utils.make_positions`. - - Args: - x: torch.Tensor x: - - Returns: torch.Tensor - """ - # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. - mask = input_ids.ne(padding_idx).int() - incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask - return incremental_indices.long() + padding_idx - - def forward( - self, - input_ids=None, - bbox=None, - token_type_ids=None, - position_ids=None, - inputs_embeds=None, - past_key_values_length=0, - ): - if position_ids is None: - if input_ids is not None: - # Create the position ids from the input token ids. Any padded tokens remain padded. 
- position_ids = self.create_position_ids_from_input_ids( - input_ids, self.padding_idx, past_key_values_length).to(input_ids.device) - else: - position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) - - if input_ids is not None: - input_shape = input_ids.size() - else: - input_shape = inputs_embeds.size()[:-1] - - if token_type_ids is None: - token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) - - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) - - embeddings = inputs_embeds + token_type_embeddings - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings - - spatial_position_embeddings = self._calc_spatial_position_embeddings(bbox) - - embeddings = embeddings + spatial_position_embeddings - - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings) - return embeddings - - def create_position_ids_from_inputs_embeds(self, inputs_embeds): - """ - We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. - - Args: - inputs_embeds: torch.Tensor≈ - - Returns: torch.Tensor - """ - input_shape = inputs_embeds.size()[:-1] - sequence_length = input_shape[1] - - position_ids = torch.arange( - self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device - ) - return position_ids.unsqueeze(0).expand(input_shape) - - -class LayoutLMv3PreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. - """ - - config_class = LayoutLMv3Config - base_model_prefix = "layoutlmv3" - - # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights - def _init_weights(self, module): - """Initialize the weights""" - if isinstance(module, nn.Linear): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - - -class LayoutLMv3SelfAttention(nn.Module): - def __init__(self, config): - super().__init__() - if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): - raise ValueError( - f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " - f"heads ({config.num_attention_heads})" - ) - - self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size - - self.query = nn.Linear(config.hidden_size, self.all_head_size) - self.key = nn.Linear(config.hidden_size, self.all_head_size) - self.value = nn.Linear(config.hidden_size, self.all_head_size) - - self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.has_relative_attention_bias = config.has_relative_attention_bias - self.has_spatial_attention_bias = config.has_spatial_attention_bias - - def 
transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) - x = x.view(*new_x_shape) - return x.permute(0, 2, 1, 3) - - def cogview_attn(self, attention_scores, alpha=32): - ''' - https://arxiv.org/pdf/2105.13290.pdf - Section 2.4 Stabilization of training: Precision Bottleneck Relaxation (PB-Relax). - A replacement of the original nn.Softmax(dim=-1)(attention_scores) - Seems the new attention_probs will result in a slower speed and a little bias - Can use torch.allclose(standard_attention_probs, cogview_attention_probs, atol=1e-08) for comparison - The smaller atol (e.g., 1e-08), the better. - ''' - scaled_attention_scores = attention_scores / alpha - max_value = scaled_attention_scores.amax(dim=(-1)).unsqueeze(-1) - # max_value = scaled_attention_scores.amax(dim=(-2, -1)).unsqueeze(-1).unsqueeze(-1) - new_attention_scores = (scaled_attention_scores - max_value) * alpha - return nn.Softmax(dim=-1)(new_attention_scores) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - rel_pos=None, - rel_2d_pos=None, - ): - mixed_query_layer = self.query(hidden_states) - - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. - is_cross_attention = encoder_hidden_states is not None - - if is_cross_attention and past_key_value is not None: - # reuse k,v, cross_attentions - key_layer = past_key_value[0] - value_layer = past_key_value[1] - attention_mask = encoder_attention_mask - elif is_cross_attention: - key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) - value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) - attention_mask = encoder_attention_mask - elif past_key_value is not None: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - key_layer = torch.cat([past_key_value[0], key_layer], dim=2) - value_layer = torch.cat([past_key_value[1], value_layer], dim=2) - else: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - - query_layer = self.transpose_for_scores(mixed_query_layer) - - # Take the dot product between "query" and "key" to get the raw attention scores. - # The attention scores QT K/√d could be significantly larger than input elements, and result in overflow. - # Changing the computational order into QT(K/√d) alleviates the problem. 
(https://arxiv.org/pdf/2105.13290.pdf) - attention_scores = torch.matmul(query_layer / math.sqrt(self.attention_head_size), key_layer.transpose(-1, -2)) - - if self.has_relative_attention_bias and self.has_spatial_attention_bias: - attention_scores += (rel_pos + rel_2d_pos) / math.sqrt(self.attention_head_size) - elif self.has_relative_attention_bias: - attention_scores += rel_pos / math.sqrt(self.attention_head_size) - - # if self.has_relative_attention_bias: - # attention_scores += rel_pos - # if self.has_spatial_attention_bias: - # attention_scores += rel_2d_pos - - # attention_scores = attention_scores / math.sqrt(self.attention_head_size) - if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in RobertaModel forward() function) - attention_scores = attention_scores + attention_mask - - # Normalize the attention scores to probabilities. - # attention_probs = nn.Softmax(dim=-1)(attention_scores) # comment the line below and use this line for speedup - attention_probs = self.cogview_attn(attention_scores) # to stablize training - # assert torch.allclose(attention_probs, nn.Softmax(dim=-1)(attention_scores), atol=1e-8) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = self.dropout(attention_probs) - - # Mask heads if we want to - if head_mask is not None: - attention_probs = attention_probs * head_mask - - context_layer = torch.matmul(attention_probs, value_layer) - - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) - context_layer = context_layer.view(*new_context_layer_shape) - - outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) - - return outputs - - -class LayoutLMv3Attention(nn.Module): - def __init__(self, config): - super().__init__() - self.self = LayoutLMv3SelfAttention(config) - self.output = RobertaSelfOutput(config) - self.pruned_heads = set() - - def prune_heads(self, heads): - if len(heads) == 0: - return - heads, index = find_pruneable_heads_and_indices( - heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads - ) - - # Prune linear layers - self.self.query = prune_linear_layer(self.self.query, index) - self.self.key = prune_linear_layer(self.self.key, index) - self.self.value = prune_linear_layer(self.self.value, index) - self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) - - # Update hyper params and store pruned heads - self.self.num_attention_heads = self.self.num_attention_heads - len(heads) - self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads - self.pruned_heads = self.pruned_heads.union(heads) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - rel_pos=None, - rel_2d_pos=None, - ): - self_outputs = self.self( - hidden_states, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - rel_pos=rel_pos, - rel_2d_pos=rel_2d_pos, - ) - attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them - return outputs - - -class LayoutLMv3Layer(nn.Module): - def __init__(self, config): - super().__init__() - 
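[Editor's note] As a numerical sanity check on the PB-Relax softmax used in the attention block above: dividing the scores by alpha, subtracting the per-row maximum, and scaling back before the softmax is mathematically identical to a plain softmax and only buys floating-point headroom. A small self-contained sketch; the tensor shape (batch 2, 12 heads, 197 tokens) and the score scale are illustrative:

import torch

def cogview_softmax(scores, alpha=32):
    # PB-Relax (CogView, Section 2.4): shrink, subtract the row max, re-scale.
    scaled = scores / alpha
    scaled = scaled - scaled.amax(dim=-1, keepdim=True)
    return torch.softmax(scaled * alpha, dim=-1)

scores = torch.randn(2, 12, 197, 197) * 50  # hypothetical large attention logits
assert torch.allclose(cogview_softmax(scores), torch.softmax(scores, dim=-1), atol=1e-6)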
self.chunk_size_feed_forward = config.chunk_size_feed_forward - self.seq_len_dim = 1 - self.attention = LayoutLMv3Attention(config) - assert not config.is_decoder and not config.add_cross_attention, \ - "This version do not support decoder. Please refer to RoBERTa for implementation of is_decoder." - self.intermediate = RobertaIntermediate(config) - self.output = RobertaOutput(config) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - rel_pos=None, - rel_2d_pos=None, - ): - # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None - self_attention_outputs = self.attention( - hidden_states, - attention_mask, - head_mask, - output_attentions=output_attentions, - past_key_value=self_attn_past_key_value, - rel_pos=rel_pos, - rel_2d_pos=rel_2d_pos, - ) - attention_output = self_attention_outputs[0] - - outputs = self_attention_outputs[1:] # add self attentions if we output attention weights - - layer_output = apply_chunking_to_forward( - self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output - ) - outputs = (layer_output,) + outputs - - return outputs - - def feed_forward_chunk(self, attention_output): - intermediate_output = self.intermediate(attention_output) - layer_output = self.output(intermediate_output, attention_output) - return layer_output - - -class LayoutLMv3Encoder(nn.Module): - def __init__(self, config, detection=False, out_features=None): - super().__init__() - self.config = config - self.detection = detection - self.layer = nn.ModuleList([LayoutLMv3Layer(config) for _ in range(config.num_hidden_layers)]) - self.gradient_checkpointing = False - - self.has_relative_attention_bias = config.has_relative_attention_bias - self.has_spatial_attention_bias = config.has_spatial_attention_bias - - if self.has_relative_attention_bias: - self.rel_pos_bins = config.rel_pos_bins - self.max_rel_pos = config.max_rel_pos - self.rel_pos_onehot_size = config.rel_pos_bins - self.rel_pos_bias = nn.Linear(self.rel_pos_onehot_size, config.num_attention_heads, bias=False) - - if self.has_spatial_attention_bias: - self.max_rel_2d_pos = config.max_rel_2d_pos - self.rel_2d_pos_bins = config.rel_2d_pos_bins - self.rel_2d_pos_onehot_size = config.rel_2d_pos_bins - self.rel_pos_x_bias = nn.Linear(self.rel_2d_pos_onehot_size, config.num_attention_heads, bias=False) - self.rel_pos_y_bias = nn.Linear(self.rel_2d_pos_onehot_size, config.num_attention_heads, bias=False) - - if self.detection: - self.gradient_checkpointing = True - embed_dim = self.config.hidden_size - self.out_features = out_features - self.out_indices = [int(name[5:]) for name in out_features] - self.fpn1 = nn.Sequential( - nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), - # nn.SyncBatchNorm(embed_dim), - nn.BatchNorm2d(embed_dim), - nn.GELU(), - nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), - ) - - self.fpn2 = nn.Sequential( - nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), - ) - - self.fpn3 = nn.Identity() - - self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2) - self.ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4] - - def relative_position_bucket(self, relative_position, bidirectional=True, num_buckets=32, max_distance=128): - ret = 0 - if bidirectional: - num_buckets //= 2 - ret += 
(relative_position > 0).long() * num_buckets - n = torch.abs(relative_position) - else: - n = torch.max(-relative_position, torch.zeros_like(relative_position)) - # now n is in the range [0, inf) - - # half of the buckets are for exact increments in positions - max_exact = num_buckets // 2 - is_small = n < max_exact - - # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance - val_if_large = max_exact + ( - torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact) - ).to(torch.long) - val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1)) - - ret += torch.where(is_small, n, val_if_large) - return ret - - def _cal_1d_pos_emb(self, hidden_states, position_ids, valid_span): - VISUAL_NUM = 196 + 1 - - rel_pos_mat = position_ids.unsqueeze(-2) - position_ids.unsqueeze(-1) - - if valid_span is not None: - # for the text part, if two words are not in the same line, - # set their distance to the max value (position_ids.shape[-1]) - rel_pos_mat[(rel_pos_mat > 0) & (valid_span == False)] = position_ids.shape[1] - rel_pos_mat[(rel_pos_mat < 0) & (valid_span == False)] = -position_ids.shape[1] - - # image-text, minimum distance - rel_pos_mat[:, -VISUAL_NUM:, :-VISUAL_NUM] = 0 - rel_pos_mat[:, :-VISUAL_NUM, -VISUAL_NUM:] = 0 - - rel_pos = self.relative_position_bucket( - rel_pos_mat, - num_buckets=self.rel_pos_bins, - max_distance=self.max_rel_pos, - ) - rel_pos = F.one_hot(rel_pos, num_classes=self.rel_pos_onehot_size).type_as(hidden_states) - rel_pos = self.rel_pos_bias(rel_pos).permute(0, 3, 1, 2) - rel_pos = rel_pos.contiguous() - return rel_pos - - def _cal_2d_pos_emb(self, hidden_states, bbox): - position_coord_x = bbox[:, :, 0] - position_coord_y = bbox[:, :, 3] - rel_pos_x_2d_mat = position_coord_x.unsqueeze(-2) - position_coord_x.unsqueeze(-1) - rel_pos_y_2d_mat = position_coord_y.unsqueeze(-2) - position_coord_y.unsqueeze(-1) - rel_pos_x = self.relative_position_bucket( - rel_pos_x_2d_mat, - num_buckets=self.rel_2d_pos_bins, - max_distance=self.max_rel_2d_pos, - ) - rel_pos_y = self.relative_position_bucket( - rel_pos_y_2d_mat, - num_buckets=self.rel_2d_pos_bins, - max_distance=self.max_rel_2d_pos, - ) - rel_pos_x = F.one_hot(rel_pos_x, num_classes=self.rel_2d_pos_onehot_size).type_as(hidden_states) - rel_pos_y = F.one_hot(rel_pos_y, num_classes=self.rel_2d_pos_onehot_size).type_as(hidden_states) - rel_pos_x = self.rel_pos_x_bias(rel_pos_x).permute(0, 3, 1, 2) - rel_pos_y = self.rel_pos_y_bias(rel_pos_y).permute(0, 3, 1, 2) - rel_pos_x = rel_pos_x.contiguous() - rel_pos_y = rel_pos_y.contiguous() - rel_2d_pos = rel_pos_x + rel_pos_y - return rel_2d_pos - - def forward( - self, - hidden_states, - bbox=None, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=False, - output_hidden_states=False, - return_dict=True, - position_ids=None, - Hp=None, - Wp=None, - valid_span=None, - ): - all_hidden_states = () if output_hidden_states else None - all_self_attentions = () if output_attentions else None - all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None - - next_decoder_cache = () if use_cache else None - - rel_pos = self._cal_1d_pos_emb(hidden_states, position_ids, valid_span) if self.has_relative_attention_bias else None - rel_2d_pos = self._cal_2d_pos_emb(hidden_states, bbox) if self.has_spatial_attention_bias else None - - if 
self.detection: - feat_out = {} - j = 0 - - for i, layer_module in enumerate(self.layer): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - layer_head_mask = head_mask[i] if head_mask is not None else None - past_key_value = past_key_values[i] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - if use_cache: - logger.warning( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs) - # return module(*inputs, past_key_value, output_attentions, rel_pos, rel_2d_pos) - # The above line will cause error: - # RuntimeError: Trying to backward through the graph a second time - # (or directly access saved tensors after they have already been freed). - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(layer_module), - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - rel_pos, - rel_2d_pos - ) - else: - layer_outputs = layer_module( - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - rel_pos=rel_pos, - rel_2d_pos=rel_2d_pos, - ) - - hidden_states = layer_outputs[0] - if use_cache: - next_decoder_cache += (layer_outputs[-1],) - if output_attentions: - all_self_attentions = all_self_attentions + (layer_outputs[1],) - if self.config.add_cross_attention: - all_cross_attentions = all_cross_attentions + (layer_outputs[2],) - - if self.detection and i in self.out_indices: - xp = hidden_states[:, -Hp*Wp:, :].permute(0, 2, 1).reshape(len(hidden_states), -1, Hp, Wp) - feat_out[self.out_features[j]] = self.ops[j](xp.contiguous()) - j += 1 - - if self.detection: - return feat_out - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=next_decoder_cache, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - cross_attentions=all_cross_attentions, - ) - - -class LayoutLMv3Model(LayoutLMv3PreTrainedModel): - """ - """ - - _keys_to_ignore_on_load_missing = [r"position_ids"] - - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Roberta - def __init__(self, config, detection=False, out_features=None, image_only=False): - super().__init__(config) - self.config = config - assert not config.is_decoder and not config.add_cross_attention, \ - "This version do not support decoder. Please refer to RoBERTa for implementation of is_decoder." 
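Note: the 1D and 2D attention biases produced by _cal_1d_pos_emb and _cal_2d_pos_emb above both pass their pairwise coordinate differences through relative_position_bucket, a T5-style scheme in which half of the buckets cover small exact offsets and the other half are log-spaced up to max_distance, before the one-hot result is projected to per-head biases. A minimal standalone sketch of that bucketing, assuming only torch and mirroring the encoder's defaults:

import math
import torch

def relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
    # Half of the buckets encode the sign plus small exact offsets,
    # the other half are log-spaced out to max_distance.
    ret = 0
    if bidirectional:
        num_buckets //= 2
        ret += (relative_position > 0).long() * num_buckets
        n = torch.abs(relative_position)
    else:
        n = torch.max(-relative_position, torch.zeros_like(relative_position))
    max_exact = num_buckets // 2
    is_small = n < max_exact
    val_if_large = max_exact + (
        torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
    ).to(torch.long)
    val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
    ret += torch.where(is_small, n, val_if_large)
    return ret

# 1D usage as in _cal_1d_pos_emb: bucket the pairwise differences of position ids.
position_ids = torch.arange(6).unsqueeze(0)                       # (batch, seq)
rel_pos_mat = position_ids.unsqueeze(-2) - position_ids.unsqueeze(-1)
buckets = relative_position_bucket(rel_pos_mat)
print(buckets.shape)                                              # torch.Size([1, 6, 6])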
- self.detection = detection - if not self.detection: - self.image_only = False - else: - assert config.visual_embed - self.image_only = image_only - - if not self.image_only: - self.embeddings = LayoutLMv3Embeddings(config) - self.encoder = LayoutLMv3Encoder(config, detection=detection, out_features=out_features) - - if config.visual_embed: - embed_dim = self.config.hidden_size - # use the default pre-training parameters for fine-tuning (e.g., input_size) - # when the input_size is larger in fine-tuning, we will interpolate the position embedding in forward - self.patch_embed = PatchEmbed(embed_dim=embed_dim) - - patch_size = 16 - size = int(self.config.input_size / patch_size) - self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) - self.pos_embed = nn.Parameter(torch.zeros(1, size * size + 1, embed_dim)) - self.pos_drop = nn.Dropout(p=0.) - - self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - if self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias: - self._init_visual_bbox(img_size=(size, size)) - - from functools import partial - norm_layer = partial(nn.LayerNorm, eps=1e-6) - self.norm = norm_layer(embed_dim) - - self.init_weights() - - def get_input_embeddings(self): - return self.embeddings.word_embeddings - - def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value - - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base - class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) - - def _init_visual_bbox(self, img_size=(14, 14), max_len=1000): - visual_bbox_x = torch.div(torch.arange(0, max_len * (img_size[1] + 1), max_len), - img_size[1], rounding_mode='trunc') - visual_bbox_y = torch.div(torch.arange(0, max_len * (img_size[0] + 1), max_len), - img_size[0], rounding_mode='trunc') - visual_bbox = torch.stack( - [ - visual_bbox_x[:-1].repeat(img_size[0], 1), - visual_bbox_y[:-1].repeat(img_size[1], 1).transpose(0, 1), - visual_bbox_x[1:].repeat(img_size[0], 1), - visual_bbox_y[1:].repeat(img_size[1], 1).transpose(0, 1), - ], - dim=-1, - ).view(-1, 4) - - cls_token_box = torch.tensor([[0 + 1, 0 + 1, max_len - 1, max_len - 1]]) - self.visual_bbox = torch.cat([cls_token_box, visual_bbox], dim=0) - - def _calc_visual_bbox(self, device, dtype, bsz): # , img_size=(14, 14), max_len=1000): - visual_bbox = self.visual_bbox.repeat(bsz, 1, 1) - visual_bbox = visual_bbox.to(device).type(dtype) - return visual_bbox - - def forward_image(self, x): - if self.detection: - x = self.patch_embed(x, self.pos_embed[:, 1:, :] if self.pos_embed is not None else None) - else: - x = self.patch_embed(x) - batch_size, seq_len, _ = x.size() - - cls_tokens = self.cls_token.expand(batch_size, -1, -1) # stole cls_tokens impl from Phil Wang, thanks - if self.pos_embed is not None and self.detection: - cls_tokens = cls_tokens + self.pos_embed[:, :1, :] - - x = torch.cat((cls_tokens, x), dim=1) - if self.pos_embed is not None and not self.detection: - x = x + self.pos_embed - x = self.pos_drop(x) - - x = self.norm(x) - return x - - # Copied from transformers.models.bert.modeling_bert.BertModel.forward - def forward( - self, - input_ids=None, - bbox=None, - attention_mask=None, - token_type_ids=None, - valid_span=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - 
encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - images=None, - ): - r""" - encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if - the model is configured as a decoder. - encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in - the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. - - If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` - (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` - instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. - use_cache (:obj:`bool`, `optional`): - If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up - decoding (see :obj:`past_key_values`). - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - use_cache = False - - # if input_ids is not None and inputs_embeds is not None: - # raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - if input_ids is not None: - input_shape = input_ids.size() - batch_size, seq_length = input_shape - device = input_ids.device - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - batch_size, seq_length = input_shape - device = inputs_embeds.device - elif images is not None: - batch_size = len(images) - device = images.device - else: - raise ValueError("You have to specify either input_ids or inputs_embeds or images") - - if not self.image_only: - # past_key_values_length - past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 - - if attention_mask is None: - attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) - if token_type_ids is None: - token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) - - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. 
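The comment above refers to get_extended_attention_mask from the Hugging Face base class, which reshapes the (batch_size, seq_length) 0/1 padding mask into an additive (batch_size, 1, 1, seq_length) bias that broadcasts over heads and query positions. A rough sketch of that conversion (the helper name here is illustrative, not part of this module):

import torch

def make_extended_attention_mask(attention_mask, dtype=torch.float32):
    # 1 -> keep (bias 0); 0 -> mask (large negative bias, so softmax gives ~0 weight)
    extended = attention_mask[:, None, None, :].to(dtype)
    return (1.0 - extended) * torch.finfo(dtype).min

mask = torch.tensor([[1, 1, 1, 0, 0]])            # one sequence with two padding tokens
print(make_extended_attention_mask(mask).shape)   # torch.Size([1, 1, 1, 5])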
- # extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) - - encoder_extended_attention_mask = None - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - - if not self.image_only: - if bbox is None: - bbox = torch.zeros(tuple(list(input_shape) + [4]), dtype=torch.long, device=device) - - embedding_output = self.embeddings( - input_ids=input_ids, - bbox=bbox, - position_ids=position_ids, - token_type_ids=token_type_ids, - inputs_embeds=inputs_embeds, - past_key_values_length=past_key_values_length, - ) - - final_bbox = final_position_ids = None - Hp = Wp = None - if images is not None: - patch_size = 16 - Hp, Wp = int(images.shape[2] / patch_size), int(images.shape[3] / patch_size) - visual_emb = self.forward_image(images) - if self.detection: - visual_attention_mask = torch.ones((batch_size, visual_emb.shape[1]), dtype=torch.long, device=device) - if self.image_only: - attention_mask = visual_attention_mask - else: - attention_mask = torch.cat([attention_mask, visual_attention_mask], dim=1) - elif self.image_only: - attention_mask = torch.ones((batch_size, visual_emb.shape[1]), dtype=torch.long, device=device) - - if self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias: - if self.config.has_spatial_attention_bias: - visual_bbox = self._calc_visual_bbox(device, dtype=torch.long, bsz=batch_size) - if self.image_only: - final_bbox = visual_bbox - else: - final_bbox = torch.cat([bbox, visual_bbox], dim=1) - - visual_position_ids = torch.arange(0, visual_emb.shape[1], dtype=torch.long, device=device).repeat( - batch_size, 1) - if self.image_only: - final_position_ids = visual_position_ids - else: - position_ids = torch.arange(0, input_shape[1], device=device).unsqueeze(0) - position_ids = position_ids.expand_as(input_ids) - final_position_ids = torch.cat([position_ids, visual_position_ids], dim=1) - - if self.image_only: - embedding_output = visual_emb - else: - embedding_output = torch.cat([embedding_output, visual_emb], dim=1) - embedding_output = self.LayerNorm(embedding_output) - embedding_output = self.dropout(embedding_output) - elif self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias: - if self.config.has_spatial_attention_bias: - final_bbox = bbox - if self.config.has_relative_attention_bias: - position_ids = self.embeddings.position_ids[:, :input_shape[1]] - position_ids = position_ids.expand_as(input_ids) - final_position_ids = position_ids - - extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, None, device) - - encoder_outputs = self.encoder( - embedding_output, - bbox=final_bbox, - position_ids=final_position_ids, - attention_mask=extended_attention_mask, - head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask, - past_key_values=past_key_values, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - Hp=Hp, - Wp=Wp, - valid_span=valid_span, - ) - - if self.detection: - return encoder_outputs - - sequence_output = encoder_outputs[0] - pooled_output = 
None - - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - - return BaseModelOutputWithPoolingAndCrossAttentions( - last_hidden_state=sequence_output, - pooler_output=pooled_output, - past_key_values=encoder_outputs.past_key_values, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, - ) - - -class LayoutLMv3ClassificationHead(nn.Module): - """ - Head for sentence-level classification tasks. - Reference: RobertaClassificationHead - """ - - def __init__(self, config, pool_feature=False): - super().__init__() - self.pool_feature = pool_feature - if pool_feature: - self.dense = nn.Linear(config.hidden_size*3, config.hidden_size) - else: - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - classifier_dropout = ( - config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob - ) - self.dropout = nn.Dropout(classifier_dropout) - self.out_proj = nn.Linear(config.hidden_size, config.num_labels) - - def forward(self, x): - # x = features[:, 0, :] # take token (equiv. to [CLS]) - x = self.dropout(x) - x = self.dense(x) - x = torch.tanh(x) - x = self.dropout(x) - x = self.out_proj(x) - return x - - -class LayoutLMv3ForTokenClassification(LayoutLMv3PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - - self.layoutlmv3 = LayoutLMv3Model(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - if config.num_labels < 10: - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - else: - self.classifier = LayoutLMv3ClassificationHead(config, pool_feature=False) - - self.init_weights() - - def forward( - self, - input_ids=None, - bbox=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - valid_span=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - images=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - - 1]``. 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.layoutlmv3( - input_ids, - bbox=bbox, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - images=images, - valid_span=valid_span, - ) - - sequence_output = outputs[0] - - sequence_output = self.dropout(sequence_output) - logits = self.classifier(sequence_output) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - # Only keep active parts of the loss - if attention_mask is not None: - active_loss = attention_mask.view(-1) == 1 - active_logits = logits.view(-1, self.num_labels) - active_labels = torch.where( - active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) - ) - loss = loss_fct(active_logits, active_labels) - else: - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - - return TokenClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -class LayoutLMv3ForQuestionAnswering(LayoutLMv3PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - - self.layoutlmv3 = LayoutLMv3Model(config) - # self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - self.qa_outputs = LayoutLMv3ClassificationHead(config, pool_feature=False) - - self.init_weights() - - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - valid_span=None, - head_mask=None, - inputs_embeds=None, - start_positions=None, - end_positions=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - bbox=None, - images=None, - ): - r""" - start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the - sequence are not taken into account for computing the loss. - end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the - sequence are not taken into account for computing the loss. 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.layoutlmv3( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - bbox=bbox, - images=images, - valid_span=valid_span, - ) - - sequence_output = outputs[0] - - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1).contiguous() - end_logits = end_logits.squeeze(-1).contiguous() - - total_loss = None - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions = start_positions.clamp(0, ignored_index) - end_positions = end_positions.clamp(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - - return QuestionAnsweringModelOutput( - loss=total_loss, - start_logits=start_logits, - end_logits=end_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -class LayoutLMv3ForSequenceClassification(LayoutLMv3PreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.config = config - self.layoutlmv3 = LayoutLMv3Model(config) - self.classifier = LayoutLMv3ClassificationHead(config, pool_feature=False) - - self.init_weights() - - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - valid_span=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - bbox=None, - images=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., - config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), - If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.layoutlmv3( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - bbox=bbox, - images=images, - valid_span=valid_span, - ) - - sequence_output = outputs[0][:, 0, :] - logits = self.classifier(sequence_output) - - loss = None - if labels is not None: - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(logits, labels) - - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) diff --git a/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py b/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py deleted file mode 100644 index f340d3c6aca04b6567614e6aa221f7c542239305..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py +++ /dev/null @@ -1,32 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Tokenization classes for LayoutLMv3, refer to RoBERTa.""" - -from transformers.models.roberta import RobertaTokenizer -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - -VOCAB_FILES_NAMES = { - "vocab_file": "vocab.json", - "merges_file": "merges.txt", -} - -class LayoutLMv3Tokenizer(RobertaTokenizer): - vocab_files_names = VOCAB_FILES_NAMES - # pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - # max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - model_input_names = ["input_ids", "attention_mask"] diff --git a/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py b/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py deleted file mode 100644 index 9fd75ff1d3bd7725025114e99320afd80823e9d0..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py +++ /dev/null @@ -1,34 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Fast Tokenization classes for LayoutLMv3, refer to RoBERTa.""" - - -from transformers.models.roberta.tokenization_roberta_fast import RobertaTokenizerFast -from transformers.utils import logging - -from .tokenization_layoutlmv3 import LayoutLMv3Tokenizer - - -logger = logging.get_logger(__name__) - -VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"} - - -class LayoutLMv3TokenizerFast(RobertaTokenizerFast): - vocab_files_names = VOCAB_FILES_NAMES - # pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - # max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - model_input_names = ["input_ids", "attention_mask"] - slow_tokenizer_class = LayoutLMv3Tokenizer diff --git a/magic_pdf/model/sub_modules/layout/layoutlmv3/model_init.py b/magic_pdf/model/sub_modules/layout/layoutlmv3/model_init.py deleted file mode 100644 index a624d60d5a75902f3c44d3dfbe1ef350cddf7427..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/layout/layoutlmv3/model_init.py +++ /dev/null @@ -1,151 +0,0 @@ -from .visualizer import Visualizer -from .rcnn_vl import * -from .backbone import * - -from detectron2.config import get_cfg -from detectron2.config import CfgNode as CN -from detectron2.data import MetadataCatalog, DatasetCatalog -from detectron2.data.datasets import register_coco_instances -from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch, DefaultPredictor - - -def add_vit_config(cfg): - """ - Add config for VIT. - """ - _C = cfg - - _C.MODEL.VIT = CN() - - # CoaT model name. - _C.MODEL.VIT.NAME = "" - - # Output features from CoaT backbone. 
- _C.MODEL.VIT.OUT_FEATURES = ["layer3", "layer5", "layer7", "layer11"] - - _C.MODEL.VIT.IMG_SIZE = [224, 224] - - _C.MODEL.VIT.POS_TYPE = "shared_rel" - - _C.MODEL.VIT.DROP_PATH = 0. - - _C.MODEL.VIT.MODEL_KWARGS = "{}" - - _C.SOLVER.OPTIMIZER = "ADAMW" - - _C.SOLVER.BACKBONE_MULTIPLIER = 1.0 - - _C.AUG = CN() - - _C.AUG.DETR = False - - _C.MODEL.IMAGE_ONLY = True - _C.PUBLAYNET_DATA_DIR_TRAIN = "" - _C.PUBLAYNET_DATA_DIR_TEST = "" - _C.FOOTNOTE_DATA_DIR_TRAIN = "" - _C.FOOTNOTE_DATA_DIR_VAL = "" - _C.SCIHUB_DATA_DIR_TRAIN = "" - _C.SCIHUB_DATA_DIR_TEST = "" - _C.JIAOCAI_DATA_DIR_TRAIN = "" - _C.JIAOCAI_DATA_DIR_TEST = "" - _C.ICDAR_DATA_DIR_TRAIN = "" - _C.ICDAR_DATA_DIR_TEST = "" - _C.M6DOC_DATA_DIR_TEST = "" - _C.DOCSTRUCTBENCH_DATA_DIR_TEST = "" - _C.DOCSTRUCTBENCHv2_DATA_DIR_TEST = "" - _C.CACHE_DIR = "" - _C.MODEL.CONFIG_PATH = "" - - # effective update steps would be MAX_ITER/GRADIENT_ACCUMULATION_STEPS - # maybe need to set MAX_ITER *= GRADIENT_ACCUMULATION_STEPS - _C.SOLVER.GRADIENT_ACCUMULATION_STEPS = 1 - - -def setup(args, device): - """ - Create configs and perform basic setups. - """ - cfg = get_cfg() - - # add_coat_config(cfg) - add_vit_config(cfg) - cfg.merge_from_file(args.config_file) - cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.2 # set threshold for this model - cfg.merge_from_list(args.opts) - - # 使用统一的device配置 - cfg.MODEL.DEVICE = device - - cfg.freeze() - default_setup(cfg, args) - - #@todo 可以删掉这块? - # register_coco_instances( - # "scihub_train", - # {}, - # cfg.SCIHUB_DATA_DIR_TRAIN + ".json", - # cfg.SCIHUB_DATA_DIR_TRAIN - # ) - - return cfg - - -class DotDict(dict): - def __init__(self, *args, **kwargs): - super(DotDict, self).__init__(*args, **kwargs) - - def __getattr__(self, key): - if key not in self.keys(): - return None - value = self[key] - if isinstance(value, dict): - value = DotDict(value) - return value - - def __setattr__(self, key, value): - self[key] = value - - -class Layoutlmv3_Predictor(object): - def __init__(self, weights, config_file, device): - layout_args = { - "config_file": config_file, - "resume": False, - "eval_only": False, - "num_gpus": 1, - "num_machines": 1, - "machine_rank": 0, - "dist_url": "tcp://127.0.0.1:57823", - "opts": ["MODEL.WEIGHTS", weights], - } - layout_args = DotDict(layout_args) - - cfg = setup(layout_args, device) - self.mapping = ["title", "plain text", "abandon", "figure", "figure_caption", "table", "table_caption", - "table_footnote", "isolate_formula", "formula_caption"] - MetadataCatalog.get(cfg.DATASETS.TRAIN[0]).thing_classes = self.mapping - self.predictor = DefaultPredictor(cfg) - - def __call__(self, image, ignore_catids=[]): - # page_layout_result = { - # "layout_dets": [] - # } - layout_dets = [] - outputs = self.predictor(image) - boxes = outputs["instances"].to("cpu")._fields["pred_boxes"].tensor.tolist() - labels = outputs["instances"].to("cpu")._fields["pred_classes"].tolist() - scores = outputs["instances"].to("cpu")._fields["scores"].tolist() - for bbox_idx in range(len(boxes)): - if labels[bbox_idx] in ignore_catids: - continue - layout_dets.append({ - "category_id": labels[bbox_idx], - "poly": [ - boxes[bbox_idx][0], boxes[bbox_idx][1], - boxes[bbox_idx][2], boxes[bbox_idx][1], - boxes[bbox_idx][2], boxes[bbox_idx][3], - boxes[bbox_idx][0], boxes[bbox_idx][3], - ], - "score": scores[bbox_idx] - }) - return layout_dets diff --git a/magic_pdf/model/sub_modules/layout/layoutlmv3/rcnn_vl.py b/magic_pdf/model/sub_modules/layout/layoutlmv3/rcnn_vl.py deleted file mode 100644 index 
46b2e16102e8782eb675b518b7d870dc8d007ba8..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/layout/layoutlmv3/rcnn_vl.py +++ /dev/null @@ -1,163 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -import logging -import numpy as np -from typing import Dict, List, Optional, Tuple -import torch -from torch import nn - -from detectron2.config import configurable -from detectron2.structures import ImageList, Instances -from detectron2.utils.events import get_event_storage - -from detectron2.modeling.backbone import Backbone, build_backbone -from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY - -from detectron2.modeling.meta_arch import GeneralizedRCNN - -from detectron2.modeling.postprocessing import detector_postprocess -from detectron2.modeling.roi_heads.fast_rcnn import fast_rcnn_inference_single_image -from contextlib import contextmanager -from itertools import count - -@META_ARCH_REGISTRY.register() -class VLGeneralizedRCNN(GeneralizedRCNN): - """ - Generalized R-CNN. Any models that contains the following three components: - 1. Per-image feature extraction (aka backbone) - 2. Region proposal generation - 3. Per-region feature extraction and prediction - """ - - def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]): - """ - Args: - batched_inputs: a list, batched outputs of :class:`DatasetMapper` . - Each item in the list contains the inputs for one image. - For now, each item in the list is a dict that contains: - - * image: Tensor, image in (C, H, W) format. - * instances (optional): groundtruth :class:`Instances` - * proposals (optional): :class:`Instances`, precomputed proposals. - - Other information that's included in the original dicts, such as: - - * "height", "width" (int): the output resolution of the model, used in inference. - See :meth:`postprocess` for details. - - Returns: - list[dict]: - Each dict is the output for one input image. - The dict contains one key "instances" whose value is a :class:`Instances`. - The :class:`Instances` object has the following keys: - "pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints" - """ - if not self.training: - return self.inference(batched_inputs) - - images = self.preprocess_image(batched_inputs) - if "instances" in batched_inputs[0]: - gt_instances = [x["instances"].to(self.device) for x in batched_inputs] - else: - gt_instances = None - - # features = self.backbone(images.tensor) - input = self.get_batch(batched_inputs, images) - features = self.backbone(input) - - if self.proposal_generator is not None: - proposals, proposal_losses = self.proposal_generator(images, features, gt_instances) - else: - assert "proposals" in batched_inputs[0] - proposals = [x["proposals"].to(self.device) for x in batched_inputs] - proposal_losses = {} - - _, detector_losses = self.roi_heads(images, features, proposals, gt_instances) - if self.vis_period > 0: - storage = get_event_storage() - if storage.iter % self.vis_period == 0: - self.visualize_training(batched_inputs, proposals) - - losses = {} - losses.update(detector_losses) - losses.update(proposal_losses) - return losses - - def inference( - self, - batched_inputs: List[Dict[str, torch.Tensor]], - detected_instances: Optional[List[Instances]] = None, - do_postprocess: bool = True, - ): - """ - Run inference on the given inputs. - - Args: - batched_inputs (list[dict]): same as in :meth:`forward` - detected_instances (None or list[Instances]): if not None, it - contains an `Instances` object per image. 
The `Instances` - object contains "pred_boxes" and "pred_classes" which are - known boxes in the image. - The inference will then skip the detection of bounding boxes, - and only predict other per-ROI outputs. - do_postprocess (bool): whether to apply post-processing on the outputs. - - Returns: - When do_postprocess=True, same as in :meth:`forward`. - Otherwise, a list[Instances] containing raw network outputs. - """ - assert not self.training - - images = self.preprocess_image(batched_inputs) - # features = self.backbone(images.tensor) - input = self.get_batch(batched_inputs, images) - features = self.backbone(input) - - if detected_instances is None: - if self.proposal_generator is not None: - proposals, _ = self.proposal_generator(images, features, None) - else: - assert "proposals" in batched_inputs[0] - proposals = [x["proposals"].to(self.device) for x in batched_inputs] - - results, _ = self.roi_heads(images, features, proposals, None) - else: - detected_instances = [x.to(self.device) for x in detected_instances] - results = self.roi_heads.forward_with_given_boxes(features, detected_instances) - - if do_postprocess: - assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess." - return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes) - else: - return results - - def get_batch(self, examples, images): - if len(examples) >= 1 and "bbox" not in examples[0]: # image_only - return {"images": images.tensor} - - return input - - def _batch_inference(self, batched_inputs, detected_instances=None): - """ - Execute inference on a list of inputs, - using batch size = self.batch_size (e.g., 2), instead of the length of the list. - - Inputs & outputs have the same format as :meth:`GeneralizedRCNN.inference` - """ - if detected_instances is None: - detected_instances = [None] * len(batched_inputs) - - outputs = [] - inputs, instances = [], [] - for idx, input, instance in zip(count(), batched_inputs, detected_instances): - inputs.append(input) - instances.append(instance) - if len(inputs) == 2 or idx == len(batched_inputs) - 1: - outputs.extend( - self.inference( - inputs, - instances if instances[0] is not None else None, - do_postprocess=True, # False - ) - ) - inputs, instances = [], [] - return outputs diff --git a/magic_pdf/model/sub_modules/layout/layoutlmv3/visualizer.py b/magic_pdf/model/sub_modules/layout/layoutlmv3/visualizer.py deleted file mode 100644 index 8185984e66f0267be6368317c60dc543dcb69e87..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/layout/layoutlmv3/visualizer.py +++ /dev/null @@ -1,1236 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. 
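One detail worth noting from rcnn_vl.py above: _batch_inference feeds the predictor at most two inputs at a time instead of the whole list, then concatenates the per-image outputs. A stripped-down sketch of that chunking (illustrative helper name; the detected_instances handling is omitted):

from itertools import count

def batch_in_pairs(run_inference, batched_inputs, batch_size=2):
    # Accumulate inputs and flush a chunk whenever it reaches batch_size
    # or the list is exhausted, extending the combined output list.
    outputs, chunk = [], []
    for idx, item in zip(count(), batched_inputs):
        chunk.append(item)
        if len(chunk) == batch_size or idx == len(batched_inputs) - 1:
            outputs.extend(run_inference(chunk))
            chunk = []
    return outputs

# toy usage: an "inference" call that just echoes its inputs
print(batch_in_pairs(lambda xs: [f"processed {x}" for x in xs], ["a", "b", "c"]))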
-import colorsys -import logging -import math -import numpy as np -from enum import Enum, unique -import cv2 -import matplotlib as mpl -import matplotlib.colors as mplc -import matplotlib.figure as mplfigure -import pycocotools.mask as mask_util -import torch -from matplotlib.backends.backend_agg import FigureCanvasAgg -from PIL import Image - -from detectron2.data import MetadataCatalog -from detectron2.structures import BitMasks, Boxes, BoxMode, Keypoints, PolygonMasks, RotatedBoxes -from detectron2.utils.file_io import PathManager - -from detectron2.utils.colormap import random_color - -import pdb - -logger = logging.getLogger(__name__) - -__all__ = ["ColorMode", "VisImage", "Visualizer"] - - -_SMALL_OBJECT_AREA_THRESH = 1000 -_LARGE_MASK_AREA_THRESH = 120000 -_OFF_WHITE = (1.0, 1.0, 240.0 / 255) -_BLACK = (0, 0, 0) -_RED = (1.0, 0, 0) - -_KEYPOINT_THRESHOLD = 0.05 - -#CLASS_NAMES = ["footnote", "footer", "header"] - -@unique -class ColorMode(Enum): - """ - Enum of different color modes to use for instance visualizations. - """ - - IMAGE = 0 - """ - Picks a random color for every instance and overlay segmentations with low opacity. - """ - SEGMENTATION = 1 - """ - Let instances of the same category have similar colors - (from metadata.thing_colors), and overlay them with - high opacity. This provides more attention on the quality of segmentation. - """ - IMAGE_BW = 2 - """ - Same as IMAGE, but convert all areas without masks to gray-scale. - Only available for drawing per-instance mask predictions. - """ - - -class GenericMask: - """ - Attribute: - polygons (list[ndarray]): list[ndarray]: polygons for this mask. - Each ndarray has format [x, y, x, y, ...] - mask (ndarray): a binary mask - """ - - def __init__(self, mask_or_polygons, height, width): - self._mask = self._polygons = self._has_holes = None - self.height = height - self.width = width - - m = mask_or_polygons - if isinstance(m, dict): - # RLEs - assert "counts" in m and "size" in m - if isinstance(m["counts"], list): # uncompressed RLEs - h, w = m["size"] - assert h == height and w == width - m = mask_util.frPyObjects(m, h, w) - self._mask = mask_util.decode(m)[:, :] - return - - if isinstance(m, list): # list[ndarray] - self._polygons = [np.asarray(x).reshape(-1) for x in m] - return - - if isinstance(m, np.ndarray): # assumed to be a binary mask - assert m.shape[1] != 2, m.shape - assert m.shape == ( - height, - width, - ), f"mask shape: {m.shape}, target dims: {height}, {width}" - self._mask = m.astype("uint8") - return - - raise ValueError("GenericMask cannot handle object {} of type '{}'".format(m, type(m))) - - @property - def mask(self): - if self._mask is None: - self._mask = self.polygons_to_mask(self._polygons) - return self._mask - - @property - def polygons(self): - if self._polygons is None: - self._polygons, self._has_holes = self.mask_to_polygons(self._mask) - return self._polygons - - @property - def has_holes(self): - if self._has_holes is None: - if self._mask is not None: - self._polygons, self._has_holes = self.mask_to_polygons(self._mask) - else: - self._has_holes = False # if original format is polygon, does not have holes - return self._has_holes - - def mask_to_polygons(self, mask): - # cv2.RETR_CCOMP flag retrieves all the contours and arranges them to a 2-level - # hierarchy. External contours (boundary) of the object are placed in hierarchy-1. - # Internal contours (holes) are placed in hierarchy-2. - # cv2.CHAIN_APPROX_NONE flag gets vertices of polygons from contours. 
- mask = np.ascontiguousarray(mask) # some versions of cv2 does not support incontiguous arr - res = cv2.findContours(mask.astype("uint8"), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE) - hierarchy = res[-1] - if hierarchy is None: # empty mask - return [], False - has_holes = (hierarchy.reshape(-1, 4)[:, 3] >= 0).sum() > 0 - res = res[-2] - res = [x.flatten() for x in res] - # These coordinates from OpenCV are integers in range [0, W-1 or H-1]. - # We add 0.5 to turn them into real-value coordinate space. A better solution - # would be to first +0.5 and then dilate the returned polygon by 0.5. - res = [x + 0.5 for x in res if len(x) >= 6] - return res, has_holes - - def polygons_to_mask(self, polygons): - rle = mask_util.frPyObjects(polygons, self.height, self.width) - rle = mask_util.merge(rle) - return mask_util.decode(rle)[:, :] - - def area(self): - return self.mask.sum() - - def bbox(self): - p = mask_util.frPyObjects(self.polygons, self.height, self.width) - p = mask_util.merge(p) - bbox = mask_util.toBbox(p) - bbox[2] += bbox[0] - bbox[3] += bbox[1] - return bbox - - -class _PanopticPrediction: - """ - Unify different panoptic annotation/prediction formats - """ - - def __init__(self, panoptic_seg, segments_info, metadata=None): - if segments_info is None: - assert metadata is not None - # If "segments_info" is None, we assume "panoptic_img" is a - # H*W int32 image storing the panoptic_id in the format of - # category_id * label_divisor + instance_id. We reserve -1 for - # VOID label. - label_divisor = metadata.label_divisor - segments_info = [] - for panoptic_label in np.unique(panoptic_seg.numpy()): - if panoptic_label == -1: - # VOID region. - continue - pred_class = panoptic_label // label_divisor - isthing = pred_class in metadata.thing_dataset_id_to_contiguous_id.values() - segments_info.append( - { - "id": int(panoptic_label), - "category_id": int(pred_class), - "isthing": bool(isthing), - } - ) - del metadata - - self._seg = panoptic_seg - - self._sinfo = {s["id"]: s for s in segments_info} # seg id -> seg info - segment_ids, areas = torch.unique(panoptic_seg, sorted=True, return_counts=True) - areas = areas.numpy() - sorted_idxs = np.argsort(-areas) - self._seg_ids, self._seg_areas = segment_ids[sorted_idxs], areas[sorted_idxs] - self._seg_ids = self._seg_ids.tolist() - for sid, area in zip(self._seg_ids, self._seg_areas): - if sid in self._sinfo: - self._sinfo[sid]["area"] = float(area) - - def non_empty_mask(self): - """ - Returns: - (H, W) array, a mask for all pixels that have a prediction - """ - empty_ids = [] - for id in self._seg_ids: - if id not in self._sinfo: - empty_ids.append(id) - if len(empty_ids) == 0: - return np.zeros(self._seg.shape, dtype=np.uint8) - assert ( - len(empty_ids) == 1 - ), ">1 ids corresponds to no labels. This is currently not supported" - return (self._seg != empty_ids[0]).numpy().astype(np.bool) - - def semantic_masks(self): - for sid in self._seg_ids: - sinfo = self._sinfo.get(sid) - if sinfo is None or sinfo["isthing"]: - # Some pixels (e.g. id 0 in PanopticFPN) have no instance or semantic predictions. 
- continue - yield (self._seg == sid).numpy().astype(np.bool), sinfo - - def instance_masks(self): - for sid in self._seg_ids: - sinfo = self._sinfo.get(sid) - if sinfo is None or not sinfo["isthing"]: - continue - mask = (self._seg == sid).numpy().astype(np.bool) - if mask.sum() > 0: - yield mask, sinfo - - -def _create_text_labels(classes, scores, class_names, is_crowd=None): - """ - Args: - classes (list[int] or None): - scores (list[float] or None): - class_names (list[str] or None): - is_crowd (list[bool] or None): - - Returns: - list[str] or None - """ - #class_names = CLASS_NAMES - labels = None - if classes is not None: - if class_names is not None and len(class_names) > 0: - labels = [class_names[i] for i in classes] - else: - labels = [str(i) for i in classes] - - if scores is not None: - if labels is None: - labels = ["{:.0f}%".format(s * 100) for s in scores] - else: - labels = ["{} {:.0f}%".format(l, s * 100) for l, s in zip(labels, scores)] - if labels is not None and is_crowd is not None: - labels = [l + ("|crowd" if crowd else "") for l, crowd in zip(labels, is_crowd)] - return labels - - -class VisImage: - def __init__(self, img, scale=1.0): - """ - Args: - img (ndarray): an RGB image of shape (H, W, 3) in range [0, 255]. - scale (float): scale the input image - """ - self.img = img - self.scale = scale - self.width, self.height = img.shape[1], img.shape[0] - self._setup_figure(img) - - def _setup_figure(self, img): - """ - Args: - Same as in :meth:`__init__()`. - - Returns: - fig (matplotlib.pyplot.figure): top level container for all the image plot elements. - ax (matplotlib.pyplot.Axes): contains figure elements and sets the coordinate system. - """ - fig = mplfigure.Figure(frameon=False) - self.dpi = fig.get_dpi() - # add a small 1e-2 to avoid precision lost due to matplotlib's truncation - # (https://github.com/matplotlib/matplotlib/issues/15363) - fig.set_size_inches( - (self.width * self.scale + 1e-2) / self.dpi, - (self.height * self.scale + 1e-2) / self.dpi, - ) - self.canvas = FigureCanvasAgg(fig) - # self.canvas = mpl.backends.backend_cairo.FigureCanvasCairo(fig) - ax = fig.add_axes([0.0, 0.0, 1.0, 1.0]) - ax.axis("off") - self.fig = fig - self.ax = ax - self.reset_image(img) - - def reset_image(self, img): - """ - Args: - img: same as in __init__ - """ - img = img.astype("uint8") - self.ax.imshow(img, extent=(0, self.width, self.height, 0), interpolation="nearest") - - def save(self, filepath): - """ - Args: - filepath (str): a string that contains the absolute path, including the file name, where - the visualized image will be saved. - """ - self.fig.savefig(filepath) - - def get_image(self): - """ - Returns: - ndarray: - the visualized image of shape (H, W, 3) (RGB) in uint8 type. - The shape is scaled w.r.t the input image using the given `scale` argument. - """ - canvas = self.canvas - s, (width, height) = canvas.print_to_buffer() - # buf = io.BytesIO() # works for cairo backend - # canvas.print_rgba(buf) - # width, height = self.width, self.height - # s = buf.getvalue() - - buffer = np.frombuffer(s, dtype="uint8") - - img_rgba = buffer.reshape(height, width, 4) - rgb, alpha = np.split(img_rgba, [3], axis=2) - return rgb.astype("uint8") - - -class Visualizer: - """ - Visualizer that draws data about detection/segmentation on images. 
- - It contains methods like `draw_{text,box,circle,line,binary_mask,polygon}` - that draw primitive objects to images, as well as high-level wrappers like - `draw_{instance_predictions,sem_seg,panoptic_seg_predictions,dataset_dict}` - that draw composite data in some pre-defined style. - - Note that the exact visualization style for the high-level wrappers are subject to change. - Style such as color, opacity, label contents, visibility of labels, or even the visibility - of objects themselves (e.g. when the object is too small) may change according - to different heuristics, as long as the results still look visually reasonable. - - To obtain a consistent style, you can implement custom drawing functions with the - abovementioned primitive methods instead. If you need more customized visualization - styles, you can process the data yourself following their format documented in - tutorials (:doc:`/tutorials/models`, :doc:`/tutorials/datasets`). This class does not - intend to satisfy everyone's preference on drawing styles. - - This visualizer focuses on high rendering quality rather than performance. It is not - designed to be used for real-time applications. - """ - - # TODO implement a fast, rasterized version using OpenCV - - def __init__(self, img_rgb, metadata=None, scale=1.0, instance_mode=ColorMode.IMAGE): - """ - Args: - img_rgb: a numpy array of shape (H, W, C), where H and W correspond to - the height and width of the image respectively. C is the number of - color channels. The image is required to be in RGB format since that - is a requirement of the Matplotlib library. The image is also expected - to be in the range [0, 255]. - metadata (Metadata): dataset metadata (e.g. class names and colors) - instance_mode (ColorMode): defines one of the pre-defined style for drawing - instances on an image. - """ - self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8) - if metadata is None: - metadata = MetadataCatalog.get("__nonexist__") - self.metadata = metadata - self.output = VisImage(self.img, scale=scale) - self.cpu_device = torch.device("cpu") - - # too small texts are useless, therefore clamp to 9 - self._default_font_size = max( - np.sqrt(self.output.height * self.output.width) // 90, 10 // scale - ) - self._instance_mode = instance_mode - self.keypoint_threshold = _KEYPOINT_THRESHOLD - - def draw_instance_predictions(self, predictions): - """ - Draw instance-level prediction results on an image. - - Args: - predictions (Instances): the output of an instance detection/segmentation - model. Following fields will be used to draw: - "pred_boxes", "pred_classes", "scores", "pred_masks" (or "pred_masks_rle"). - - Returns: - output (VisImage): image object with visualizations. 
- """ - boxes = predictions.pred_boxes if predictions.has("pred_boxes") else None - scores = predictions.scores if predictions.has("scores") else None - classes = predictions.pred_classes.tolist() if predictions.has("pred_classes") else None - labels = _create_text_labels(classes, scores, self.metadata.get("thing_classes", None)) - keypoints = predictions.pred_keypoints if predictions.has("pred_keypoints") else None - - if predictions.has("pred_masks"): - masks = np.asarray(predictions.pred_masks) - masks = [GenericMask(x, self.output.height, self.output.width) for x in masks] - else: - masks = None - - if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"): - colors = [ - self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in classes - ] - alpha = 0.8 - else: - colors = None - alpha = 0.5 - - if self._instance_mode == ColorMode.IMAGE_BW: - self.output.reset_image( - self._create_grayscale_image( - (predictions.pred_masks.any(dim=0) > 0).numpy() - if predictions.has("pred_masks") - else None - ) - ) - alpha = 0.3 - - self.overlay_instances( - masks=masks, - boxes=boxes, - labels=labels, - keypoints=keypoints, - assigned_colors=colors, - alpha=alpha, - ) - return self.output - - def draw_sem_seg(self, sem_seg, area_threshold=None, alpha=0.8): - """ - Draw semantic segmentation predictions/labels. - - Args: - sem_seg (Tensor or ndarray): the segmentation of shape (H, W). - Each value is the integer label of the pixel. - area_threshold (int): segments with less than `area_threshold` are not drawn. - alpha (float): the larger it is, the more opaque the segmentations are. - - Returns: - output (VisImage): image object with visualizations. - """ - if isinstance(sem_seg, torch.Tensor): - sem_seg = sem_seg.numpy() - labels, areas = np.unique(sem_seg, return_counts=True) - sorted_idxs = np.argsort(-areas).tolist() - labels = labels[sorted_idxs] - for label in filter(lambda l: l < len(self.metadata.stuff_classes), labels): - try: - mask_color = [x / 255 for x in self.metadata.stuff_colors[label]] - except (AttributeError, IndexError): - mask_color = None - - binary_mask = (sem_seg == label).astype(np.uint8) - text = self.metadata.stuff_classes[label] - self.draw_binary_mask( - binary_mask, - color=mask_color, - edge_color=_OFF_WHITE, - text=text, - alpha=alpha, - area_threshold=area_threshold, - ) - return self.output - - def draw_panoptic_seg(self, panoptic_seg, segments_info, area_threshold=None, alpha=0.7): - """ - Draw panoptic prediction annotations or results. - - Args: - panoptic_seg (Tensor): of shape (height, width) where the values are ids for each - segment. - segments_info (list[dict] or None): Describe each segment in `panoptic_seg`. - If it is a ``list[dict]``, each dict contains keys "id", "category_id". - If None, category id of each pixel is computed by - ``pixel // metadata.label_divisor``. - area_threshold (int): stuff segments with less than `area_threshold` are not drawn. - - Returns: - output (VisImage): image object with visualizations. - """ - pred = _PanopticPrediction(panoptic_seg, segments_info, self.metadata) - - if self._instance_mode == ColorMode.IMAGE_BW: - self.output.reset_image(self._create_grayscale_image(pred.non_empty_mask())) - - # draw mask for all semantic segments first i.e. 
"stuff" - for mask, sinfo in pred.semantic_masks(): - category_idx = sinfo["category_id"] - try: - mask_color = [x / 255 for x in self.metadata.stuff_colors[category_idx]] - except AttributeError: - mask_color = None - - text = self.metadata.stuff_classes[category_idx] - self.draw_binary_mask( - mask, - color=mask_color, - edge_color=_OFF_WHITE, - text=text, - alpha=alpha, - area_threshold=area_threshold, - ) - - # draw mask for all instances second - all_instances = list(pred.instance_masks()) - if len(all_instances) == 0: - return self.output - masks, sinfo = list(zip(*all_instances)) - category_ids = [x["category_id"] for x in sinfo] - - try: - scores = [x["score"] for x in sinfo] - except KeyError: - scores = None - labels = _create_text_labels( - category_ids, scores, self.metadata.thing_classes, [x.get("iscrowd", 0) for x in sinfo] - ) - - try: - colors = [ - self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in category_ids - ] - except AttributeError: - colors = None - self.overlay_instances(masks=masks, labels=labels, assigned_colors=colors, alpha=alpha) - - return self.output - - draw_panoptic_seg_predictions = draw_panoptic_seg # backward compatibility - - def draw_dataset_dict(self, dic): - """ - Draw annotations/segmentaions in Detectron2 Dataset format. - - Args: - dic (dict): annotation/segmentation data of one image, in Detectron2 Dataset format. - - Returns: - output (VisImage): image object with visualizations. - """ - annos = dic.get("annotations", None) - if annos: - if "segmentation" in annos[0]: - masks = [x["segmentation"] for x in annos] - else: - masks = None - if "keypoints" in annos[0]: - keypts = [x["keypoints"] for x in annos] - keypts = np.array(keypts).reshape(len(annos), -1, 3) - else: - keypts = None - - boxes = [ - BoxMode.convert(x["bbox"], x["bbox_mode"], BoxMode.XYXY_ABS) - if len(x["bbox"]) == 4 - else x["bbox"] - for x in annos - ] - - colors = None - category_ids = [x["category_id"] for x in annos] - if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"): - colors = [ - self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) - for c in category_ids - ] - names = self.metadata.get("thing_classes", None) - labels = _create_text_labels( - category_ids, - scores=None, - class_names=names, - is_crowd=[x.get("iscrowd", 0) for x in annos], - ) - self.overlay_instances( - labels=labels, boxes=boxes, masks=masks, keypoints=keypts, assigned_colors=colors - ) - - sem_seg = dic.get("sem_seg", None) - if sem_seg is None and "sem_seg_file_name" in dic: - with PathManager.open(dic["sem_seg_file_name"], "rb") as f: - sem_seg = Image.open(f) - sem_seg = np.asarray(sem_seg, dtype="uint8") - if sem_seg is not None: - self.draw_sem_seg(sem_seg, area_threshold=0, alpha=0.5) - - pan_seg = dic.get("pan_seg", None) - if pan_seg is None and "pan_seg_file_name" in dic: - with PathManager.open(dic["pan_seg_file_name"], "rb") as f: - pan_seg = Image.open(f) - pan_seg = np.asarray(pan_seg) - from panopticapi.utils import rgb2id - - pan_seg = rgb2id(pan_seg) - if pan_seg is not None: - segments_info = dic["segments_info"] - pan_seg = torch.tensor(pan_seg) - self.draw_panoptic_seg(pan_seg, segments_info, area_threshold=0, alpha=0.5) - return self.output - - def overlay_instances( - self, - *, - boxes=None, - labels=None, - masks=None, - keypoints=None, - assigned_colors=None, - alpha=0.5, - ): - """ - Args: - boxes (Boxes, RotatedBoxes or ndarray): either a :class:`Boxes`, - or an Nx4 numpy array of XYXY_ABS format for the N 
objects in a single image, - or a :class:`RotatedBoxes`, - or an Nx5 numpy array of (x_center, y_center, width, height, angle_degrees) format - for the N objects in a single image, - labels (list[str]): the text to be displayed for each instance. - masks (masks-like object): Supported types are: - - * :class:`detectron2.structures.PolygonMasks`, - :class:`detectron2.structures.BitMasks`. - * list[list[ndarray]]: contains the segmentation masks for all objects in one image. - The first level of the list corresponds to individual instances. The second - level to all the polygon that compose the instance, and the third level - to the polygon coordinates. The third level should have the format of - [x0, y0, x1, y1, ..., xn, yn] (n >= 3). - * list[ndarray]: each ndarray is a binary mask of shape (H, W). - * list[dict]: each dict is a COCO-style RLE. - keypoints (Keypoint or array like): an array-like object of shape (N, K, 3), - where the N is the number of instances and K is the number of keypoints. - The last dimension corresponds to (x, y, visibility or score). - assigned_colors (list[matplotlib.colors]): a list of colors, where each color - corresponds to each mask or box in the image. Refer to 'matplotlib.colors' - for full list of formats that the colors are accepted in. - - Returns: - output (VisImage): image object with visualizations. - """ - num_instances = 0 - if boxes is not None: - boxes = self._convert_boxes(boxes) - num_instances = len(boxes) - if masks is not None: - masks = self._convert_masks(masks) - if num_instances: - assert len(masks) == num_instances - else: - num_instances = len(masks) - if keypoints is not None: - if num_instances: - assert len(keypoints) == num_instances - else: - num_instances = len(keypoints) - keypoints = self._convert_keypoints(keypoints) - if labels is not None: - assert len(labels) == num_instances - if assigned_colors is None: - assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)] - if num_instances == 0: - return self.output - if boxes is not None and boxes.shape[1] == 5: - return self.overlay_rotated_instances( - boxes=boxes, labels=labels, assigned_colors=assigned_colors - ) - - # Display in largest to smallest order to reduce occlusion. - areas = None - if boxes is not None: - areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1) - elif masks is not None: - areas = np.asarray([x.area() for x in masks]) - - if areas is not None: - sorted_idxs = np.argsort(-areas).tolist() - # Re-order overlapped instances in descending order. - boxes = boxes[sorted_idxs] if boxes is not None else None - labels = [labels[k] for k in sorted_idxs] if labels is not None else None - masks = [masks[idx] for idx in sorted_idxs] if masks is not None else None - assigned_colors = [assigned_colors[idx] for idx in sorted_idxs] - keypoints = keypoints[sorted_idxs] if keypoints is not None else None - - for i in range(num_instances): - color = assigned_colors[i] - if boxes is not None: - self.draw_box(boxes[i], edge_color=color) - - if masks is not None: - for segment in masks[i].polygons: - self.draw_polygon(segment.reshape(-1, 2), color, alpha=alpha) - - if labels is not None: - # first get a box - if boxes is not None: - x0, y0, x1, y1 = boxes[i] - text_pos = (x0, y0) # if drawing boxes, put text on the box corner. 
- horiz_align = "left" - elif masks is not None: - # skip small mask without polygon - if len(masks[i].polygons) == 0: - continue - - x0, y0, x1, y1 = masks[i].bbox() - - # draw text in the center (defined by median) when box is not drawn - # median is less sensitive to outliers. - text_pos = np.median(masks[i].mask.nonzero(), axis=1)[::-1] - horiz_align = "center" - else: - continue # drawing the box confidence for keypoints isn't very useful. - # for small objects, draw text at the side to avoid occlusion - instance_area = (y1 - y0) * (x1 - x0) - if ( - instance_area < _SMALL_OBJECT_AREA_THRESH * self.output.scale - or y1 - y0 < 40 * self.output.scale - ): - if y1 >= self.output.height - 5: - text_pos = (x1, y0) - else: - text_pos = (x0, y1) - - height_ratio = (y1 - y0) / np.sqrt(self.output.height * self.output.width) - lighter_color = self._change_color_brightness(color, brightness_factor=0.7) - font_size = ( - np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2) - * 0.5 - * self._default_font_size - ) - self.draw_text( - labels[i], - text_pos, - color=lighter_color, - horizontal_alignment=horiz_align, - font_size=font_size, - ) - - # draw keypoints - if keypoints is not None: - for keypoints_per_instance in keypoints: - self.draw_and_connect_keypoints(keypoints_per_instance) - - return self.output - - def overlay_rotated_instances(self, boxes=None, labels=None, assigned_colors=None): - """ - Args: - boxes (ndarray): an Nx5 numpy array of - (x_center, y_center, width, height, angle_degrees) format - for the N objects in a single image. - labels (list[str]): the text to be displayed for each instance. - assigned_colors (list[matplotlib.colors]): a list of colors, where each color - corresponds to each mask or box in the image. Refer to 'matplotlib.colors' - for full list of formats that the colors are accepted in. - - Returns: - output (VisImage): image object with visualizations. - """ - num_instances = len(boxes) - - if assigned_colors is None: - assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)] - if num_instances == 0: - return self.output - - # Display in largest to smallest order to reduce occlusion. - if boxes is not None: - areas = boxes[:, 2] * boxes[:, 3] - - sorted_idxs = np.argsort(-areas).tolist() - # Re-order overlapped instances in descending order. - boxes = boxes[sorted_idxs] - labels = [labels[k] for k in sorted_idxs] if labels is not None else None - colors = [assigned_colors[idx] for idx in sorted_idxs] - - for i in range(num_instances): - self.draw_rotated_box_with_label( - boxes[i], edge_color=colors[i], label=labels[i] if labels is not None else None - ) - - return self.output - - def draw_and_connect_keypoints(self, keypoints): - """ - Draws keypoints of an instance and follows the rules for keypoint connections - to draw lines between appropriate keypoints. This follows color heuristics for - line color. - - Args: - keypoints (Tensor): a tensor of shape (K, 3), where K is the number of keypoints - and the last dimension corresponds to (x, y, probability). - - Returns: - output (VisImage): image object with visualizations. 
- """ - visible = {} - keypoint_names = self.metadata.get("keypoint_names") - for idx, keypoint in enumerate(keypoints): - # draw keypoint - x, y, prob = keypoint - if prob > self.keypoint_threshold: - self.draw_circle((x, y), color=_RED) - if keypoint_names: - keypoint_name = keypoint_names[idx] - visible[keypoint_name] = (x, y) - - if self.metadata.get("keypoint_connection_rules"): - for kp0, kp1, color in self.metadata.keypoint_connection_rules: - if kp0 in visible and kp1 in visible: - x0, y0 = visible[kp0] - x1, y1 = visible[kp1] - color = tuple(x / 255.0 for x in color) - self.draw_line([x0, x1], [y0, y1], color=color) - - # draw lines from nose to mid-shoulder and mid-shoulder to mid-hip - # Note that this strategy is specific to person keypoints. - # For other keypoints, it should just do nothing - try: - ls_x, ls_y = visible["left_shoulder"] - rs_x, rs_y = visible["right_shoulder"] - mid_shoulder_x, mid_shoulder_y = (ls_x + rs_x) / 2, (ls_y + rs_y) / 2 - except KeyError: - pass - else: - # draw line from nose to mid-shoulder - nose_x, nose_y = visible.get("nose", (None, None)) - if nose_x is not None: - self.draw_line([nose_x, mid_shoulder_x], [nose_y, mid_shoulder_y], color=_RED) - - try: - # draw line from mid-shoulder to mid-hip - lh_x, lh_y = visible["left_hip"] - rh_x, rh_y = visible["right_hip"] - except KeyError: - pass - else: - mid_hip_x, mid_hip_y = (lh_x + rh_x) / 2, (lh_y + rh_y) / 2 - self.draw_line([mid_hip_x, mid_shoulder_x], [mid_hip_y, mid_shoulder_y], color=_RED) - return self.output - - """ - Primitive drawing functions: - """ - - def draw_text( - self, - text, - position, - *, - font_size=None, - color="g", - horizontal_alignment="center", - rotation=0, - ): - """ - Args: - text (str): class label - position (tuple): a tuple of the x and y coordinates to place text on image. - font_size (int, optional): font of the text. If not provided, a font size - proportional to the image width is calculated and used. - color: color of the text. Refer to `matplotlib.colors` for full list - of formats that are accepted. - horizontal_alignment (str): see `matplotlib.text.Text` - rotation: rotation angle in degrees CCW - - Returns: - output (VisImage): image object with text drawn. - """ - if not font_size: - font_size = self._default_font_size - - # since the text background is dark, we don't want the text to be dark - color = np.maximum(list(mplc.to_rgb(color)), 0.2) - color[np.argmax(color)] = max(0.8, np.max(color)) - - x, y = position - self.output.ax.text( - x, - y, - text, - size=font_size * self.output.scale, - family="sans-serif", - bbox={"facecolor": "black", "alpha": 0.8, "pad": 0.7, "edgecolor": "none"}, - verticalalignment="top", - horizontalalignment=horizontal_alignment, - color=color, - zorder=10, - rotation=rotation, - ) - return self.output - - def draw_box(self, box_coord, alpha=0.5, edge_color="g", line_style="-"): - """ - Args: - box_coord (tuple): a tuple containing x0, y0, x1, y1 coordinates, where x0 and y0 - are the coordinates of the image's top left corner. x1 and y1 are the - coordinates of the image's bottom right corner. - alpha (float): blending efficient. Smaller values lead to more transparent masks. - edge_color: color of the outline of the box. Refer to `matplotlib.colors` - for full list of formats that are accepted. - line_style (string): the string to use to create the outline of the boxes. - - Returns: - output (VisImage): image object with box drawn. 
- """ - x0, y0, x1, y1 = box_coord - width = x1 - x0 - height = y1 - y0 - - linewidth = max(self._default_font_size / 4, 1) - - self.output.ax.add_patch( - mpl.patches.Rectangle( - (x0, y0), - width, - height, - fill=False, - edgecolor=edge_color, - linewidth=linewidth * self.output.scale, - alpha=alpha, - linestyle=line_style, - ) - ) - return self.output - - def draw_rotated_box_with_label( - self, rotated_box, alpha=0.5, edge_color="g", line_style="-", label=None - ): - """ - Draw a rotated box with label on its top-left corner. - - Args: - rotated_box (tuple): a tuple containing (cnt_x, cnt_y, w, h, angle), - where cnt_x and cnt_y are the center coordinates of the box. - w and h are the width and height of the box. angle represents how - many degrees the box is rotated CCW with regard to the 0-degree box. - alpha (float): blending efficient. Smaller values lead to more transparent masks. - edge_color: color of the outline of the box. Refer to `matplotlib.colors` - for full list of formats that are accepted. - line_style (string): the string to use to create the outline of the boxes. - label (string): label for rotated box. It will not be rendered when set to None. - - Returns: - output (VisImage): image object with box drawn. - """ - cnt_x, cnt_y, w, h, angle = rotated_box - area = w * h - # use thinner lines when the box is small - linewidth = self._default_font_size / ( - 6 if area < _SMALL_OBJECT_AREA_THRESH * self.output.scale else 3 - ) - - theta = angle * math.pi / 180.0 - c = math.cos(theta) - s = math.sin(theta) - rect = [(-w / 2, h / 2), (-w / 2, -h / 2), (w / 2, -h / 2), (w / 2, h / 2)] - # x: left->right ; y: top->down - rotated_rect = [(s * yy + c * xx + cnt_x, c * yy - s * xx + cnt_y) for (xx, yy) in rect] - for k in range(4): - j = (k + 1) % 4 - self.draw_line( - [rotated_rect[k][0], rotated_rect[j][0]], - [rotated_rect[k][1], rotated_rect[j][1]], - color=edge_color, - linestyle="--" if k == 1 else line_style, - linewidth=linewidth, - ) - - if label is not None: - text_pos = rotated_rect[1] # topleft corner - - height_ratio = h / np.sqrt(self.output.height * self.output.width) - label_color = self._change_color_brightness(edge_color, brightness_factor=0.7) - font_size = ( - np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2) * 0.5 * self._default_font_size - ) - self.draw_text(label, text_pos, color=label_color, font_size=font_size, rotation=angle) - - return self.output - - def draw_circle(self, circle_coord, color, radius=3): - """ - Args: - circle_coord (list(int) or tuple(int)): contains the x and y coordinates - of the center of the circle. - color: color of the polygon. Refer to `matplotlib.colors` for a full list of - formats that are accepted. - radius (int): radius of the circle. - - Returns: - output (VisImage): image object with box drawn. - """ - x, y = circle_coord - self.output.ax.add_patch( - mpl.patches.Circle(circle_coord, radius=radius, fill=True, color=color) - ) - return self.output - - def draw_line(self, x_data, y_data, color, linestyle="-", linewidth=None): - """ - Args: - x_data (list[int]): a list containing x values of all the points being drawn. - Length of list should match the length of y_data. - y_data (list[int]): a list containing y values of all the points being drawn. - Length of list should match the length of x_data. - color: color of the line. Refer to `matplotlib.colors` for a full list of - formats that are accepted. - linestyle: style of the line. Refer to `matplotlib.lines.Line2D` - for a full list of formats that are accepted. 
- linewidth (float or None): width of the line. When it's None, - a default value will be computed and used. - - Returns: - output (VisImage): image object with line drawn. - """ - if linewidth is None: - linewidth = self._default_font_size / 3 - linewidth = max(linewidth, 1) - self.output.ax.add_line( - mpl.lines.Line2D( - x_data, - y_data, - linewidth=linewidth * self.output.scale, - color=color, - linestyle=linestyle, - ) - ) - return self.output - - def draw_binary_mask( - self, binary_mask, color=None, *, edge_color=None, text=None, alpha=0.5, area_threshold=0 - ): - """ - Args: - binary_mask (ndarray): numpy array of shape (H, W), where H is the image height and - W is the image width. Each value in the array is either a 0 or 1 value of uint8 - type. - color: color of the mask. Refer to `matplotlib.colors` for a full list of - formats that are accepted. If None, will pick a random color. - edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a - full list of formats that are accepted. - text (str): if None, will be drawn in the object's center of mass. - alpha (float): blending efficient. Smaller values lead to more transparent masks. - area_threshold (float): a connected component small than this will not be shown. - - Returns: - output (VisImage): image object with mask drawn. - """ - if color is None: - color = random_color(rgb=True, maximum=1) - color = mplc.to_rgb(color) - - has_valid_segment = False - binary_mask = binary_mask.astype("uint8") # opencv needs uint8 - mask = GenericMask(binary_mask, self.output.height, self.output.width) - shape2d = (binary_mask.shape[0], binary_mask.shape[1]) - - if not mask.has_holes: - # draw polygons for regular masks - for segment in mask.polygons: - area = mask_util.area(mask_util.frPyObjects([segment], shape2d[0], shape2d[1])) - if area < (area_threshold or 0): - continue - has_valid_segment = True - segment = segment.reshape(-1, 2) - self.draw_polygon(segment, color=color, edge_color=edge_color, alpha=alpha) - else: - # TODO: Use Path/PathPatch to draw vector graphics: - # https://stackoverflow.com/questions/8919719/how-to-plot-a-complex-polygon - rgba = np.zeros(shape2d + (4,), dtype="float32") - rgba[:, :, :3] = color - rgba[:, :, 3] = (mask.mask == 1).astype("float32") * alpha - has_valid_segment = True - self.output.ax.imshow(rgba, extent=(0, self.output.width, self.output.height, 0)) - - if text is not None and has_valid_segment: - # TODO sometimes drawn on wrong objects. the heuristics here can improve. - lighter_color = self._change_color_brightness(color, brightness_factor=0.7) - _num_cc, cc_labels, stats, centroids = cv2.connectedComponentsWithStats(binary_mask, 8) - largest_component_id = np.argmax(stats[1:, -1]) + 1 - - # draw text on the largest component, as well as other very large components. - for cid in range(1, _num_cc): - if cid == largest_component_id or stats[cid, -1] > _LARGE_MASK_AREA_THRESH: - # median is more stable than centroid - # center = centroids[largest_component_id] - center = np.median((cc_labels == cid).nonzero(), axis=1)[::-1] - self.draw_text(text, center, color=lighter_color) - return self.output - - def draw_polygon(self, segment, color, edge_color=None, alpha=0.5): - """ - Args: - segment: numpy array of shape Nx2, containing all the points in the polygon. - color: color of the polygon. Refer to `matplotlib.colors` for a full list of - formats that are accepted. - edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a - full list of formats that are accepted. 
If not provided, a darker shade - of the polygon color will be used instead. - alpha (float): blending efficient. Smaller values lead to more transparent masks. - - Returns: - output (VisImage): image object with polygon drawn. - """ - if edge_color is None: - # make edge color darker than the polygon color - if alpha > 0.8: - edge_color = self._change_color_brightness(color, brightness_factor=-0.7) - else: - edge_color = color - edge_color = mplc.to_rgb(edge_color) + (1,) - - polygon = mpl.patches.Polygon( - segment, - fill=True, - facecolor=mplc.to_rgb(color) + (alpha,), - edgecolor=edge_color, - linewidth=max(self._default_font_size // 15 * self.output.scale, 1), - ) - self.output.ax.add_patch(polygon) - return self.output - - """ - Internal methods: - """ - - def _jitter(self, color): - """ - Randomly modifies given color to produce a slightly different color than the color given. - - Args: - color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color - picked. The values in the list are in the [0.0, 1.0] range. - - Returns: - jittered_color (tuple[double]): a tuple of 3 elements, containing the RGB values of the - color after being jittered. The values in the list are in the [0.0, 1.0] range. - """ - color = mplc.to_rgb(color) - vec = np.random.rand(3) - # better to do it in another color space - vec = vec / np.linalg.norm(vec) * 0.5 - res = np.clip(vec + color, 0, 1) - return tuple(res) - - def _create_grayscale_image(self, mask=None): - """ - Create a grayscale version of the original image. - The colors in masked area, if given, will be kept. - """ - img_bw = self.img.astype("f4").mean(axis=2) - img_bw = np.stack([img_bw] * 3, axis=2) - if mask is not None: - img_bw[mask] = self.img[mask] - return img_bw - - def _change_color_brightness(self, color, brightness_factor): - """ - Depending on the brightness_factor, gives a lighter or darker color i.e. a color with - less or more saturation than the original color. - - Args: - color: color of the polygon. Refer to `matplotlib.colors` for a full list of - formats that are accepted. - brightness_factor (float): a value in [-1.0, 1.0] range. A lightness factor of - 0 will correspond to no change, a factor in [-1.0, 0) range will result in - a darker color and a factor in (0, 1.0] range will result in a lighter color. - - Returns: - modified_color (tuple[double]): a tuple containing the RGB values of the - modified color. Each value in the tuple is in the [0.0, 1.0] range. - """ - assert brightness_factor >= -1.0 and brightness_factor <= 1.0 - color = mplc.to_rgb(color) - polygon_color = colorsys.rgb_to_hls(*mplc.to_rgb(color)) - modified_lightness = polygon_color[1] + (brightness_factor * polygon_color[1]) - modified_lightness = 0.0 if modified_lightness < 0.0 else modified_lightness - modified_lightness = 1.0 if modified_lightness > 1.0 else modified_lightness - modified_color = colorsys.hls_to_rgb(polygon_color[0], modified_lightness, polygon_color[2]) - return modified_color - - def _convert_boxes(self, boxes): - """ - Convert different format of boxes to an NxB array, where B = 4 or 5 is the box dimension. - """ - if isinstance(boxes, Boxes) or isinstance(boxes, RotatedBoxes): - return boxes.tensor.detach().numpy() - else: - return np.asarray(boxes) - - def _convert_masks(self, masks_or_polygons): - """ - Convert different format of masks or polygons to a tuple of masks and polygons. 
- - Returns: - list[GenericMask]: - """ - - m = masks_or_polygons - if isinstance(m, PolygonMasks): - m = m.polygons - if isinstance(m, BitMasks): - m = m.tensor.numpy() - if isinstance(m, torch.Tensor): - m = m.numpy() - ret = [] - for x in m: - if isinstance(x, GenericMask): - ret.append(x) - else: - ret.append(GenericMask(x, self.output.height, self.output.width)) - return ret - - def _convert_keypoints(self, keypoints): - if isinstance(keypoints, Keypoints): - keypoints = keypoints.tensor - keypoints = np.asarray(keypoints) - return keypoints - - def get_output(self): - """ - Returns: - output (VisImage): the image output containing the visualizations added - to the image. - """ - return self.output diff --git a/magic_pdf/model/sub_modules/mfd/__init__.py b/magic_pdf/model/sub_modules/mfd/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py b/magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py deleted file mode 100644 index 23d230d00415997d71c14daf136779d32a02ee6b..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +++ /dev/null @@ -1,33 +0,0 @@ -from tqdm import tqdm -from ultralytics import YOLO - - -class YOLOv8MFDModel(object): - def __init__(self, weight, device="cpu"): - self.mfd_model = YOLO(weight) - self.device = device - - def predict(self, image): - mfd_res = self.mfd_model.predict( - image, imgsz=1888, conf=0.25, iou=0.45, verbose=False, device=self.device - )[0] - return mfd_res - - def batch_predict(self, images: list, batch_size: int) -> list: - images_mfd_res = [] - # for index in range(0, len(images), batch_size): - for index in tqdm(range(0, len(images), batch_size), desc="MFD Predict"): - mfd_res = [ - image_res.cpu() - for image_res in self.mfd_model.predict( - images[index : index + batch_size], - imgsz=1888, - conf=0.25, - iou=0.45, - verbose=False, - device=self.device, - ) - ] - for image_res in mfd_res: - images_mfd_res.append(image_res) - return images_mfd_res diff --git a/magic_pdf/model/sub_modules/mfd/yolov8/__init__.py b/magic_pdf/model/sub_modules/mfd/yolov8/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/magic_pdf/model/sub_modules/mfr/__init__.py b/magic_pdf/model/sub_modules/mfr/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py b/magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py deleted file mode 100644 index 6c3a1e1d8a58d5e0f4e178875803df85c123e0d0..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +++ /dev/null @@ -1,135 +0,0 @@ -import torch -from torch.utils.data import DataLoader, Dataset -from tqdm import tqdm - - -class MathDataset(Dataset): - def __init__(self, image_paths, transform=None): - self.image_paths = image_paths - self.transform = transform - - def __len__(self): - return len(self.image_paths) - - def __getitem__(self, idx): - raw_image = self.image_paths[idx] - if self.transform: - image = self.transform(raw_image) - return image - - -class UnimernetModel(object): - def __init__(self, weight_dir, cfg_path, _device_="cpu"): - from .unimernet_hf import UnimernetModel - if _device_.startswith("mps"): - self.model = UnimernetModel.from_pretrained(weight_dir, 
attn_implementation="eager") - else: - self.model = UnimernetModel.from_pretrained(weight_dir) - self.device = _device_ - self.model.to(_device_) - if not _device_.startswith("cpu"): - self.model = self.model.to(dtype=torch.float16) - self.model.eval() - - def predict(self, mfd_res, image): - formula_list = [] - mf_image_list = [] - for xyxy, conf, cla in zip( - mfd_res.boxes.xyxy.cpu(), mfd_res.boxes.conf.cpu(), mfd_res.boxes.cls.cpu() - ): - xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy] - new_item = { - "category_id": 13 + int(cla.item()), - "poly": [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax], - "score": round(float(conf.item()), 2), - "latex": "", - } - formula_list.append(new_item) - bbox_img = image[ymin:ymax, xmin:xmax] - mf_image_list.append(bbox_img) - - dataset = MathDataset(mf_image_list, transform=self.model.transform) - dataloader = DataLoader(dataset, batch_size=32, num_workers=0) - mfr_res = [] - for mf_img in dataloader: - mf_img = mf_img.to(dtype=self.model.dtype) - mf_img = mf_img.to(self.device) - with torch.no_grad(): - output = self.model.generate({"image": mf_img}) - mfr_res.extend(output["fixed_str"]) - for res, latex in zip(formula_list, mfr_res): - res["latex"] = latex - return formula_list - - def batch_predict(self, images_mfd_res: list, images: list, batch_size: int = 64) -> list: - images_formula_list = [] - mf_image_list = [] - backfill_list = [] - image_info = [] # Store (area, original_index, image) tuples - - # Collect images with their original indices - for image_index in range(len(images_mfd_res)): - mfd_res = images_mfd_res[image_index] - np_array_image = images[image_index] - formula_list = [] - - for idx, (xyxy, conf, cla) in enumerate(zip( - mfd_res.boxes.xyxy, mfd_res.boxes.conf, mfd_res.boxes.cls - )): - xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy] - new_item = { - "category_id": 13 + int(cla.item()), - "poly": [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax], - "score": round(float(conf.item()), 2), - "latex": "", - } - formula_list.append(new_item) - bbox_img = np_array_image[ymin:ymax, xmin:xmax] - area = (xmax - xmin) * (ymax - ymin) - - curr_idx = len(mf_image_list) - image_info.append((area, curr_idx, bbox_img)) - mf_image_list.append(bbox_img) - - images_formula_list.append(formula_list) - backfill_list += formula_list - - # Stable sort by area - image_info.sort(key=lambda x: x[0]) # sort by area - sorted_indices = [x[1] for x in image_info] - sorted_images = [x[2] for x in image_info] - - # Create mapping for results - index_mapping = {new_idx: old_idx for new_idx, old_idx in enumerate(sorted_indices)} - - # Create dataset with sorted images - dataset = MathDataset(sorted_images, transform=self.model.transform) - dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=0) - - # Process batches and store results - mfr_res = [] - # for mf_img in dataloader: - - with tqdm(total=len(sorted_images), desc="MFR Predict") as pbar: - for index, mf_img in enumerate(dataloader): - mf_img = mf_img.to(dtype=self.model.dtype) - mf_img = mf_img.to(self.device) - with torch.no_grad(): - output = self.model.generate({"image": mf_img}) - mfr_res.extend(output["fixed_str"]) - - # 更新进度条,每次增加batch_size,但要注意最后一个batch可能不足batch_size - current_batch_size = min(batch_size, len(sorted_images) - index * batch_size) - pbar.update(current_batch_size) - - # Restore original order - unsorted_results = [""] * len(mfr_res) - for new_idx, latex in enumerate(mfr_res): - original_idx = index_mapping[new_idx] - unsorted_results[original_idx] = 
latex - - # Fill results back - for res, latex in zip(backfill_list, unsorted_results): - res["latex"] = latex - - return images_formula_list diff --git a/magic_pdf/model/sub_modules/mfr/unimernet/__init__.py b/magic_pdf/model/sub_modules/mfr/unimernet/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/__init__.py b/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/__init__.py deleted file mode 100644 index 772dcfa32813a2f7befe217ee5addd3e4e6ee28a..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -from .unimer_swin import UnimerSwinConfig, UnimerSwinModel, UnimerSwinImageProcessor -from .unimer_mbart import UnimerMBartConfig, UnimerMBartModel, UnimerMBartForCausalLM -from .modeling_unimernet import UnimernetModel - -__all__ = [ - "UnimerSwinConfig", - "UnimerSwinModel", - "UnimerSwinImageProcessor", - "UnimerMBartConfig", - "UnimerMBartModel", - "UnimerMBartForCausalLM", - "UnimernetModel", -] diff --git a/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py b/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py deleted file mode 100644 index a4a9bbb931b5dc12786babe3731d00586879de46..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py +++ /dev/null @@ -1,490 +0,0 @@ -import os -import re -import warnings -from typing import Optional - -import torch -from ftfy import fix_text -from loguru import logger - -from transformers import AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer, PretrainedConfig, PreTrainedModel -from transformers import VisionEncoderDecoderConfig, VisionEncoderDecoderModel -from transformers.models.vision_encoder_decoder.modeling_vision_encoder_decoder import logger as base_model_logger - -from .unimer_swin import UnimerSwinConfig, UnimerSwinModel, UnimerSwinImageProcessor -from .unimer_mbart import UnimerMBartConfig, UnimerMBartForCausalLM - -AutoConfig.register(UnimerSwinConfig.model_type, UnimerSwinConfig) -AutoConfig.register(UnimerMBartConfig.model_type, UnimerMBartConfig) -AutoModel.register(UnimerSwinConfig, UnimerSwinModel) -AutoModelForCausalLM.register(UnimerMBartConfig, UnimerMBartForCausalLM) - - -# TODO: rewrite tokenizer -class TokenizerWrapper: - def __init__(self, tokenizer): - self.tokenizer = tokenizer - self.pad_token_id = self.tokenizer.pad_token_id - self.bos_token_id = self.tokenizer.bos_token_id - self.eos_token_id = self.tokenizer.eos_token_id - - def __len__(self): - return len(self.tokenizer) - - def tokenize(self, text, **kwargs): - return self.tokenizer( - text, - return_token_type_ids=False, - return_tensors="pt", - padding="longest", - truncation=True, - **kwargs, - ) - - def token2str(self, tokens) -> list: - generated_text = self.tokenizer.batch_decode(tokens, skip_special_tokens=True) - generated_text = [fix_text(text) for text in generated_text] - return generated_text - - def detokenize(self, tokens): - toks = [self.tokenizer.convert_ids_to_tokens(tok) for tok in tokens] - for b in range(len(toks)): - for i in reversed(range(len(toks[b]))): - if toks[b][i] is None: - toks[b][i] = '' - toks[b][i] = toks[b][i].replace('Ġ', ' ').strip() - if toks[b][i] in ([self.tokenizer.bos_token, self.tokenizer.eos_token, self.tokenizer.pad_token]): - del toks[b][i] - return toks - - 
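The `batch_predict` method above sorts the cropped formula images by area before batching and then scatters the recognition results back into their original order through an index mapping. A minimal, standalone sketch of that sort-and-restore pattern (the `batch_in_size_order` helper and the `run_model` callback are hypothetical stand-ins for the real batched recognition call):

```python
# Sketch of the sort-by-size / restore-original-order batching pattern used by
# UnimernetModel.batch_predict. Items are (height, width) sizes; run_model is a
# hypothetical stand-in for the batched model call.

def batch_in_size_order(items, batch_size, run_model):
    # Sort indices by area so similarly sized crops land in the same batch,
    # which keeps padding overhead inside each batch low.
    order = sorted(range(len(items)), key=lambda i: items[i][0] * items[i][1])

    results_sorted = []
    for start in range(0, len(order), batch_size):
        batch = [items[i] for i in order[start:start + batch_size]]
        results_sorted.extend(run_model(batch))

    # Scatter results back into the original input order.
    results = [None] * len(items)
    for new_idx, old_idx in enumerate(order):
        results[old_idx] = results_sorted[new_idx]
    return results


if __name__ == "__main__":
    sizes = [(30, 200), (10, 20), (60, 400), (12, 25)]
    # A fake "model" that just reports the area of each crop.
    print(batch_in_size_order(sizes, batch_size=2, run_model=lambda b: [h * w for h, w in b]))
```

The size ordering is purely a throughput optimization; the index mapping is what guarantees the returned LaTeX strings still line up with `backfill_list` in the original method.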
-LEFT_PATTERN = re.compile(r'(\\left)(\S*)') -RIGHT_PATTERN = re.compile(r'(\\right)(\S*)') -LEFT_COUNT_PATTERN = re.compile(r'\\left(?![a-zA-Z])') -RIGHT_COUNT_PATTERN = re.compile(r'\\right(?![a-zA-Z])') -LEFT_RIGHT_REMOVE_PATTERN = re.compile(r'\\left\.?|\\right\.?') - -def fix_latex_left_right(s): - """ - 修复LaTeX中的\\left和\\right命令 - 1. 确保它们后面跟有效分隔符 - 2. 平衡\\left和\\right的数量 - """ - # 白名单分隔符 - valid_delims_list = [r'(', r')', r'[', r']', r'{', r'}', r'/', r'|', - r'\{', r'\}', r'\lceil', r'\rceil', r'\lfloor', - r'\rfloor', r'\backslash', r'\uparrow', r'\downarrow', - r'\Uparrow', r'\Downarrow', r'\|', r'\.'] - - # 为\left后缺失有效分隔符的情况添加点 - def fix_delim(match, is_left=True): - cmd = match.group(1) # \left 或 \right - rest = match.group(2) if len(match.groups()) > 1 else "" - if not rest or rest not in valid_delims_list: - return cmd + "." - return match.group(0) - - # 使用更精确的模式匹配\left和\right命令 - # 确保它们是独立的命令,不是其他命令的一部分 - # 使用预编译正则和统一回调函数 - s = LEFT_PATTERN.sub(lambda m: fix_delim(m, True), s) - s = RIGHT_PATTERN.sub(lambda m: fix_delim(m, False), s) - - # 更精确地计算\left和\right的数量 - left_count = len(LEFT_COUNT_PATTERN.findall(s)) # 不匹配\lefteqn等 - right_count = len(RIGHT_COUNT_PATTERN.findall(s)) # 不匹配\rightarrow等 - - if left_count == right_count: - # 如果数量相等,检查是否在同一组 - return fix_left_right_pairs(s) - else: - # 如果数量不等,移除所有\left和\right - # logger.debug(f"latex:{s}") - # logger.warning(f"left_count: {left_count}, right_count: {right_count}") - return LEFT_RIGHT_REMOVE_PATTERN.sub('', s) - - -def fix_left_right_pairs(latex_formula): - """ - 检测并修复LaTeX公式中\\left和\\right不在同一组的情况 - - Args: - latex_formula (str): 输入的LaTeX公式 - - Returns: - str: 修复后的LaTeX公式 - """ - # 用于跟踪花括号嵌套层级 - brace_stack = [] - # 用于存储\left信息: (位置, 深度, 分隔符) - left_stack = [] - # 存储需要调整的\right信息: (开始位置, 结束位置, 目标位置) - adjustments = [] - - i = 0 - while i < len(latex_formula): - # 检查是否是转义字符 - if i > 0 and latex_formula[i - 1] == '\\': - backslash_count = 0 - j = i - 1 - while j >= 0 and latex_formula[j] == '\\': - backslash_count += 1 - j -= 1 - - if backslash_count % 2 == 1: - i += 1 - continue - - # 检测\left命令 - if i + 5 < len(latex_formula) and latex_formula[i:i + 5] == "\\left" and i + 5 < len(latex_formula): - delimiter = latex_formula[i + 5] - left_stack.append((i, len(brace_stack), delimiter)) - i += 6 # 跳过\left和分隔符 - continue - - # 检测\right命令 - elif i + 6 < len(latex_formula) and latex_formula[i:i + 6] == "\\right" and i + 6 < len(latex_formula): - delimiter = latex_formula[i + 6] - - if left_stack: - left_pos, left_depth, left_delim = left_stack.pop() - - # 如果\left和\right不在同一花括号深度 - if left_depth != len(brace_stack): - # 找到\left所在花括号组的结束位置 - target_pos = find_group_end(latex_formula, left_pos, left_depth) - if target_pos != -1: - # 记录需要移动的\right - adjustments.append((i, i + 7, target_pos)) - - i += 7 # 跳过\right和分隔符 - continue - - # 处理花括号 - if latex_formula[i] == '{': - brace_stack.append(i) - elif latex_formula[i] == '}': - if brace_stack: - brace_stack.pop() - - i += 1 - - # 应用调整,从后向前处理以避免索引变化 - if not adjustments: - return latex_formula - - result = list(latex_formula) - adjustments.sort(reverse=True, key=lambda x: x[0]) - - for start, end, target in adjustments: - # 提取\right部分 - right_part = result[start:end] - # 从原位置删除 - del result[start:end] - # 在目标位置插入 - result.insert(target, ''.join(right_part)) - - return ''.join(result) - - -def find_group_end(text, pos, depth): - """查找特定深度的花括号组的结束位置""" - current_depth = depth - i = pos - - while i < len(text): - if text[i] == '{' and (i == 0 or not is_escaped(text, i)): - current_depth 
+= 1 - elif text[i] == '}' and (i == 0 or not is_escaped(text, i)): - current_depth -= 1 - if current_depth < depth: - return i - i += 1 - - return -1 # 未找到对应结束位置 - - -def is_escaped(text, pos): - """检查字符是否被转义""" - backslash_count = 0 - j = pos - 1 - while j >= 0 and text[j] == '\\': - backslash_count += 1 - j -= 1 - - return backslash_count % 2 == 1 - - -def fix_unbalanced_braces(latex_formula): - """ - 检测LaTeX公式中的花括号是否闭合,并删除无法配对的花括号 - - Args: - latex_formula (str): 输入的LaTeX公式 - - Returns: - str: 删除无法配对的花括号后的LaTeX公式 - """ - stack = [] # 存储左括号的索引 - unmatched = set() # 存储不匹配括号的索引 - i = 0 - - while i < len(latex_formula): - # 检查是否是转义的花括号 - if latex_formula[i] in ['{', '}']: - # 计算前面连续的反斜杠数量 - backslash_count = 0 - j = i - 1 - while j >= 0 and latex_formula[j] == '\\': - backslash_count += 1 - j -= 1 - - # 如果前面有奇数个反斜杠,则该花括号是转义的,不参与匹配 - if backslash_count % 2 == 1: - i += 1 - continue - - # 否则,该花括号参与匹配 - if latex_formula[i] == '{': - stack.append(i) - else: # latex_formula[i] == '}' - if stack: # 有对应的左括号 - stack.pop() - else: # 没有对应的左括号 - unmatched.add(i) - - i += 1 - - # 所有未匹配的左括号 - unmatched.update(stack) - - # 构建新字符串,删除不匹配的括号 - return ''.join(char for i, char in enumerate(latex_formula) if i not in unmatched) - - -def process_latex(input_string): - """ - 处理LaTeX公式中的反斜杠: - 1. 如果\后跟特殊字符(#$%&~_^\\{})或空格,保持不变 - 2. 如果\后跟两个小写字母,保持不变 - 3. 其他情况,在\后添加空格 - - Args: - input_string (str): 输入的LaTeX公式 - - Returns: - str: 处理后的LaTeX公式 - """ - - def replace_func(match): - # 获取\后面的字符 - next_char = match.group(1) - - # 如果是特殊字符或空格,保持不变 - if next_char in "#$%&~_^|\\{} \t\n\r\v\f": - return match.group(0) - - # 如果是字母,检查下一个字符 - if 'a' <= next_char <= 'z' or 'A' <= next_char <= 'Z': - pos = match.start() + 2 # \x后的位置 - if pos < len(input_string) and ('a' <= input_string[pos] <= 'z' or 'A' <= input_string[pos] <= 'Z'): - # 下一个字符也是字母,保持不变 - return match.group(0) - - # 其他情况,在\后添加空格 - return '\\' + ' ' + next_char - - # 匹配\后面跟一个字符的情况 - pattern = r'\\(.)' - - return re.sub(pattern, replace_func, input_string) - -# 常见的在KaTeX/MathJax中可用的数学环境 -ENV_TYPES = ['array', 'matrix', 'pmatrix', 'bmatrix', 'vmatrix', - 'Bmatrix', 'Vmatrix', 'cases', 'aligned', 'gathered'] -ENV_BEGIN_PATTERNS = {env: re.compile(r'\\begin\{' + env + r'\}') for env in ENV_TYPES} -ENV_END_PATTERNS = {env: re.compile(r'\\end\{' + env + r'\}') for env in ENV_TYPES} -ENV_FORMAT_PATTERNS = {env: re.compile(r'\\begin\{' + env + r'\}\{([^}]*)\}') for env in ENV_TYPES} - -def fix_latex_environments(s): - """ - 检测LaTeX中环境(如array)的\\begin和\\end是否匹配 - 1. 如果缺少\\begin标签则在开头添加 - 2. 
如果缺少\\end标签则在末尾添加 - """ - for env in ENV_TYPES: - begin_count = len(ENV_BEGIN_PATTERNS[env].findall(s)) - end_count = len(ENV_END_PATTERNS[env].findall(s)) - - if begin_count != end_count: - if end_count > begin_count: - format_match = ENV_FORMAT_PATTERNS[env].search(s) - default_format = '{c}' if env == 'array' else '' - format_str = '{' + format_match.group(1) + '}' if format_match else default_format - - missing_count = end_count - begin_count - begin_command = '\\begin{' + env + '}' + format_str + ' ' - s = begin_command * missing_count + s - else: - missing_count = begin_count - end_count - s = s + (' \\end{' + env + '}') * missing_count - - return s - - -UP_PATTERN = re.compile(r'\\up([a-zA-Z]+)') -COMMANDS_TO_REMOVE_PATTERN = re.compile( - r'\\(?:lefteqn|boldmath|ensuremath|centering|textsubscript|sides|textsl|textcent|emph|protect|null)') -REPLACEMENTS_PATTERNS = { - re.compile(r'\\underbar'): r'\\underline', - re.compile(r'\\Bar'): r'\\hat', - re.compile(r'\\Hat'): r'\\hat', - re.compile(r'\\Tilde'): r'\\tilde', - re.compile(r'\\slash'): r'/', - re.compile(r'\\textperthousand'): r'‰', - re.compile(r'\\sun'): r'☉', - re.compile(r'\\textunderscore'): r'\\_', - re.compile(r'\\fint'): r'⨏', - re.compile(r'\\up '): r'\\ ', - re.compile(r'\\vline = '): r'\\models ', - re.compile(r'\\vDash '): r'\\models ', - re.compile(r'\\sq \\sqcup '): r'\\square ', -} -QQUAD_PATTERN = re.compile(r'\\qquad(?!\s)') - -def latex_rm_whitespace(s: str): - """Remove unnecessary whitespace from LaTeX code.""" - s = fix_unbalanced_braces(s) - s = fix_latex_left_right(s) - s = fix_latex_environments(s) - - # 使用预编译的正则表达式 - s = UP_PATTERN.sub( - lambda m: m.group(0) if m.group(1) in ["arrow", "downarrow", "lus", "silon"] else f"\\{m.group(1)}", s - ) - s = COMMANDS_TO_REMOVE_PATTERN.sub('', s) - - # 应用所有替换 - for pattern, replacement in REPLACEMENTS_PATTERNS.items(): - s = pattern.sub(replacement, s) - - # 处理LaTeX中的反斜杠和空格 - s = process_latex(s) - - # \qquad后补空格 - s = QQUAD_PATTERN.sub(r'\\qquad ', s) - - return s - - -class UnimernetModel(VisionEncoderDecoderModel): - def __init__( - self, - config: Optional[PretrainedConfig] = None, - encoder: Optional[PreTrainedModel] = None, - decoder: Optional[PreTrainedModel] = None, - ): - # VisionEncoderDecoderModel's checking log has bug, disable for temp. 
- base_model_logger.disabled = True - try: - super().__init__(config, encoder, decoder) - finally: - base_model_logger.disabled = False - - if not config or not hasattr(config, "_name_or_path"): - raise RuntimeError("config._name_or_path is required by UnimernetModel.") - - model_path = config._name_or_path - self.transform = UnimerSwinImageProcessor() - self.tokenizer = TokenizerWrapper(AutoTokenizer.from_pretrained(model_path)) - self._post_check() - - def _post_check(self): - tokenizer = self.tokenizer - - if tokenizer.tokenizer.model_max_length != self.config.decoder.max_position_embeddings: - warnings.warn( - f"decoder.max_position_embeddings={self.config.decoder.max_position_embeddings}," + - f" but tokenizer.model_max_length={tokenizer.tokenizer.model_max_length}, will set" + - f" tokenizer.model_max_length to {self.config.decoder.max_position_embeddings}.") - tokenizer.tokenizer.model_max_length = self.config.decoder.max_position_embeddings - - assert self.config.decoder.vocab_size == len(tokenizer) - assert self.config.decoder_start_token_id == tokenizer.bos_token_id - assert self.config.pad_token_id == tokenizer.pad_token_id - - @classmethod - def from_checkpoint(cls, model_path: str, model_filename: str = "pytorch_model.pth", state_dict_strip_prefix="model.model."): - config = VisionEncoderDecoderConfig.from_pretrained(model_path) - config._name_or_path = model_path - config.encoder = UnimerSwinConfig(**vars(config.encoder)) - config.decoder = UnimerMBartConfig(**vars(config.decoder)) - - encoder = UnimerSwinModel(config.encoder) - decoder = UnimerMBartForCausalLM(config.decoder) - model = cls(config, encoder, decoder) - - # load model weights - model_file_path = os.path.join(model_path, model_filename) - checkpoint = torch.load(model_file_path, map_location="cpu", weights_only=True) - state_dict = checkpoint["model"] if "model" in checkpoint else checkpoint - if not state_dict: - raise RuntimeError("state_dict is empty.") - if state_dict_strip_prefix: - state_dict = { - k[len(state_dict_strip_prefix):] if k.startswith(state_dict_strip_prefix) else k: v - for k, v in state_dict.items() - } - missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) - if len(unexpected_keys) > 0: - warnings.warn("Unexpected key(s) in state_dict: {}.".format(", ".join(f'"{k}"' for k in unexpected_keys))) - if len(missing_keys) > 0: - raise RuntimeError("Missing key(s) in state_dict: {}.".format(", ".join(f'"{k}"' for k in missing_keys))) - return model - - def forward_bak(self, samples): - pixel_values, text = samples["image"], samples["text_input"] - - text_inputs = self.tokenizer.tokenize(text).to(pixel_values.device) - decoder_input_ids, decoder_attention_mask = text_inputs["input_ids"], text_inputs["attention_mask"] - - num_channels = pixel_values.shape[1] - if num_channels == 1: - pixel_values = pixel_values.repeat(1, 3, 1, 1) - - labels = decoder_input_ids * 1 - labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100) - - loss = self.model( - pixel_values=pixel_values, - decoder_input_ids=decoder_input_ids[:, :-1], - decoder_attention_mask=decoder_attention_mask[:, :-1], - labels=labels[:, 1:], - ).loss - return {"loss": loss} - - def generate(self, samples, do_sample: bool = False, temperature: float = 0.2, top_p: float = 0.95): - pixel_values = samples["image"] - num_channels = pixel_values.shape[1] - if num_channels == 1: - pixel_values = pixel_values.repeat(1, 3, 1, 1) - - kwargs = {} - if do_sample: - kwargs["temperature"] = temperature - 
kwargs["top_p"] = top_p - - outputs = super().generate( - pixel_values=pixel_values, - max_new_tokens=self.tokenizer.tokenizer.model_max_length, # required - decoder_start_token_id=self.tokenizer.tokenizer.bos_token_id, - do_sample=do_sample, - **kwargs, - ) - - outputs = outputs[:, 1:].cpu().numpy() - pred_tokens = self.tokenizer.detokenize(outputs) - pred_str = self.tokenizer.token2str(outputs) - fixed_str = [latex_rm_whitespace(s) for s in pred_str] - return {"pred_ids": outputs, "pred_tokens": pred_tokens, "pred_str": pred_str, "fixed_str": fixed_str} - diff --git a/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/__init__.py b/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/__init__.py deleted file mode 100644 index 155a786bf087fad4c9707dd6a38d8f3a252937b3..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from .configuration_unimer_mbart import UnimerMBartConfig -from .modeling_unimer_mbart import UnimerMBartModel, UnimerMBartForCausalLM - -__all__ = [ - "UnimerMBartConfig", - "UnimerMBartModel", - "UnimerMBartForCausalLM", -] diff --git a/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/configuration_unimer_mbart.py b/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/configuration_unimer_mbart.py deleted file mode 100644 index eef4a57d069104e0f45b93c02454e82e41d084c4..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/configuration_unimer_mbart.py +++ /dev/null @@ -1,163 +0,0 @@ -# coding=utf-8 -# Copyright 2021, The Facebook AI Research Team and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""UnimerMBART model configuration""" - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - - -class UnimerMBartConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`MBartModel`]. It is used to instantiate an MBART - model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the MBART - [facebook/mbart-large-cc25](https://huggingface.co/facebook/mbart-large-cc25) architecture. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 50265): - Vocabulary size of the MBART model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`MBartModel`] or [`TFMBartModel`]. - d_model (`int`, *optional*, defaults to 1024): - Dimensionality of the layers and the pooler layer. 
- qk_squeeze (`int`, *optional*, defaults to 2): - Squeeze ratio for query/key's output dimension. See the [UniMERNet paper](https://arxiv.org/abs/2404.15254). - Squeeze Attention maps the query and key to a lower-dimensional space without excessive loss of information, - thereby accelerating the computation of attention. - encoder_layers (`int`, *optional*, defaults to 12): - Number of encoder layers. - decoder_layers (`int`, *optional*, defaults to 12): - Number of decoder layers. - encoder_attention_heads (`int`, *optional*, defaults to 16): - Number of attention heads for each attention layer in the Transformer encoder. - decoder_attention_heads (`int`, *optional*, defaults to 16): - Number of attention heads for each attention layer in the Transformer decoder. - decoder_ffn_dim (`int`, *optional*, defaults to 4096): - Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. - encoder_ffn_dim (`int`, *optional*, defaults to 4096): - Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. - activation_function (`str` or `function`, *optional*, defaults to `"gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, - `"relu"`, `"silu"` and `"gelu_new"` are supported. - dropout (`float`, *optional*, defaults to 0.1): - The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - activation_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for activations inside the fully connected layer. - classifier_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for classifier. - max_position_embeddings (`int`, *optional*, defaults to 1024): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). - init_std (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - encoder_layerdrop (`float`, *optional*, defaults to 0.0): - The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) - for more details. - decoder_layerdrop (`float`, *optional*, defaults to 0.0): - The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) - for more details. - scale_embedding (`bool`, *optional*, defaults to `False`): - Scale embeddings by diving by sqrt(d_model). - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models) - forced_eos_token_id (`int`, *optional*, defaults to 2): - The id of the token to force as the last generated token when `max_length` is reached. Usually set to - `eos_token_id`. 
- - Example: - - ```python - >>> from transformers import MBartConfig, MBartModel - - >>> # Initializing a MBART facebook/mbart-large-cc25 style configuration - >>> configuration = MBartConfig() - - >>> # Initializing a model (with random weights) from the facebook/mbart-large-cc25 style configuration - >>> model = MBartModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "unimer-mbart" - keys_to_ignore_at_inference = ["past_key_values"] - attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} - - def __init__( - self, - vocab_size=50265, - max_position_embeddings=1024, - encoder_layers=12, - encoder_ffn_dim=4096, - encoder_attention_heads=16, - decoder_layers=12, - decoder_ffn_dim=4096, - decoder_attention_heads=16, - encoder_layerdrop=0.0, - decoder_layerdrop=0.0, - use_cache=True, - is_encoder_decoder=True, - activation_function="gelu", - d_model=1024, - qk_squeeze=2, - dropout=0.1, - attention_dropout=0.0, - activation_dropout=0.0, - init_std=0.02, - classifier_dropout=0.0, - scale_embedding=False, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - forced_eos_token_id=2, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.d_model = d_model - self.qk_squeeze = qk_squeeze - self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_layers = encoder_layers - self.encoder_attention_heads = encoder_attention_heads - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.activation_function = activation_function - self.init_std = init_std - self.encoder_layerdrop = encoder_layerdrop - self.decoder_layerdrop = decoder_layerdrop - self.classifier_dropout = classifier_dropout - self.use_cache = use_cache - self.num_hidden_layers = encoder_layers - self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - is_encoder_decoder=is_encoder_decoder, - forced_eos_token_id=forced_eos_token_id, - **kwargs, - ) diff --git a/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/modeling_unimer_mbart.py b/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/modeling_unimer_mbart.py deleted file mode 100644 index 08a5a049a2bad5b1ea3c61edc7d6aac01c4f95a1..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/modeling_unimer_mbart.py +++ /dev/null @@ -1,2351 +0,0 @@ -# coding=utf-8 -# Copyright 2021, The Facebook AI Research Team and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
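For orientation, `qk_squeeze` above narrows only the query/key projections: `UnimerMBartAttention` later in this diff computes `squeeze_dim = embed_dim // qk_squeeze` and scales queries by `squeeze_head_dim ** -0.5`, while the value and output projections keep the full width. A small sketch of that shape arithmetic (the `squeeze_attention_dims` helper is hypothetical; plain Python, no transformers dependency assumed):

```python
# Shape arithmetic behind qk_squeeze in UnimerMBartAttention: q/k are projected
# to embed_dim // qk_squeeze, while v and the output projection keep embed_dim.

def squeeze_attention_dims(d_model: int, num_heads: int, qk_squeeze: int) -> dict:
    head_dim = d_model // num_heads              # per-head width of v / output
    squeeze_dim = d_model // qk_squeeze          # total q/k width after squeezing
    squeeze_head_dim = squeeze_dim // num_heads  # per-head q/k width
    return {
        "q_proj/k_proj out_features": squeeze_dim,
        "v_proj/out_proj out_features": d_model,
        "head_dim": head_dim,
        "squeeze_head_dim": squeeze_head_dim,
        "scaling": squeeze_head_dim ** -0.5,     # applied to the query states
    }


if __name__ == "__main__":
    # UnimerMBartConfig defaults: d_model=1024, 16 decoder attention heads, qk_squeeze=2,
    # so q/k shrink to 512 (32 per head) while v/out stay at 1024 (64 per head).
    print(squeeze_attention_dims(d_model=1024, num_heads=16, qk_squeeze=2))
```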
-"""PyTorch UnimerMBART model.""" - -import copy -import math -from dataclasses import dataclass -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.modeling_attn_mask_utils import ( - _prepare_4d_attention_mask, - _prepare_4d_attention_mask_for_sdpa, - _prepare_4d_causal_attention_mask, - _prepare_4d_causal_attention_mask_for_sdpa, -) -from transformers.modeling_outputs import ( - BaseModelOutput, - BaseModelOutputWithPastAndCrossAttentions, - CausalLMOutputWithCrossAttentions, - Seq2SeqLMOutput, - Seq2SeqModelOutput, - Seq2SeqQuestionAnsweringModelOutput, - Seq2SeqSequenceClassifierOutput, -) -from transformers import GenerationMixin, PreTrainedModel -from transformers.utils import ( - add_code_sample_docstrings, - add_end_docstrings, - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, - logging, - replace_return_docstrings, -) -from .configuration_unimer_mbart import UnimerMBartConfig - - -if is_flash_attn_2_available(): - from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa - - -logger = logging.get_logger(__name__) - -_CHECKPOINT_FOR_DOC = "facebook/mbart-large-cc25" -_CONFIG_FOR_DOC = "MBartConfig" - -# Base model docstring -_EXPECTED_OUTPUT_SHAPE = [1, 8, 1024] - - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int): - """ - Shift input ids one token to the right, and wrap the last non pad token (the token) Note that MBart does not - have a single `decoder_start_token_id` in contrast to other Bart-like models. - """ - prev_output_tokens = input_ids.clone() - - if pad_token_id is None: - raise ValueError("self.model.config.pad_token_id has to be defined.") - # replace possible -100 values in labels by `pad_token_id` - prev_output_tokens.masked_fill_(prev_output_tokens == -100, pad_token_id) - - index_of_eos = (prev_output_tokens.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1) - decoder_start_tokens = prev_output_tokens.gather(1, index_of_eos).squeeze() - prev_output_tokens[:, 1:] = prev_output_tokens[:, :-1].clone() - prev_output_tokens[:, 0] = decoder_start_tokens - - return prev_output_tokens - -@dataclass -class CausalLMOutputWithCrossAttentionsAndCounting(CausalLMOutputWithCrossAttentions): - """ - Base class for causal language model (or autoregressive) outputs. - - Args: - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): - Language modeling loss (for next-token prediction). - logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 
- hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Cross attentions weights after the attention softmax, used to compute the weighted average in the - cross-attention heads. - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `torch.FloatTensor` tuples of length `config.n_layers`, with each tuple containing the cached key, - value states of the self-attention and the cross-attention layers if model is used in encoder-decoder - setting. Only relevant if `config.is_decoder = True`. - - Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see - `past_key_values` input) to speed up sequential decoding. - counting: - Counting - """ - counting: Optional[torch.FloatTensor] = None - -# Copied from transformers.models.bart.modeling_bart.BartLearnedPositionalEmbedding with Bart->MBart -class UnimerMBartLearnedPositionalEmbedding(nn.Embedding): - """ - This module learns positional embeddings up to a fixed maximum size. - """ - - def __init__(self, num_embeddings: int, embedding_dim: int): - # MBart is set up so that if padding_idx is specified then offset the embedding ids by 2 - # and adjust num_embeddings appropriately. Other models don't have this hack - self.offset = 2 - super().__init__(num_embeddings + self.offset, embedding_dim) - - def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0): - """`input_ids' shape is expected to be [bsz x seqlen].""" - - bsz, seq_len = input_ids.shape[:2] - positions = torch.arange( - past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device - ).expand(bsz, -1) - - return super().forward(positions + self.offset) - - -# Copied from transformers.models.bart.modeling_bart.BartScaledWordEmbedding with Bart->MBart -class UnimerMBartScaledWordEmbedding(nn.Embedding): - """ - This module overrides nn.Embeddings' forward by multiplying with embeddings scale. 
- """ - - def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: Optional[float] = 1.0): - super().__init__(num_embeddings, embedding_dim, padding_idx) - self.embed_scale = embed_scale - - def forward(self, input_ids: torch.Tensor): - return super().forward(input_ids) * self.embed_scale - - -# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->MBart -class UnimerMBartAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper, with qk_squeeze""" - - def __init__( - self, - embed_dim: int, - num_heads: int, - dropout: float = 0.0, - is_decoder: bool = False, - bias: bool = True, - is_causal: bool = False, - *, - config: UnimerMBartConfig, - ): - super().__init__() - self.embed_dim = embed_dim - self.num_heads = num_heads - self.dropout = dropout - self.head_dim = embed_dim // num_heads - self.config = config - - if (self.head_dim * num_heads) != self.embed_dim: - raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" - f" and `num_heads`: {num_heads})." - ) - - self.squeeze_dim = embed_dim // config.qk_squeeze - self.squeeze_head_dim = self.squeeze_dim // num_heads - self.scaling = self.squeeze_head_dim**-0.5 - self.is_decoder = is_decoder - self.is_causal = is_causal - - self.q_proj = nn.Linear(embed_dim, self.squeeze_dim, bias=bias) - self.k_proj = nn.Linear(embed_dim, self.squeeze_dim, bias=bias) - self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - - def _shape_qk(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.squeeze_head_dim).transpose(1, 2).contiguous() - - def _shape_v(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - key_value_states: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.Tensor] = None, - layer_head_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" - - # if key_value_states are provided this layer is used as a cross-attention layer - # for the decoder - is_cross_attention = key_value_states is not None - - bsz, tgt_len, _ = hidden_states.size() - - # get query proj - query_states = self.q_proj(hidden_states) * self.scaling - # get key, value proj - # `past_key_value[0].shape[2] == key_value_states.shape[1]` - # is checking that the `sequence_length` of the `past_key_value` is the same as - # the provided `key_value_states` to support prefix tuning - if ( - is_cross_attention - and past_key_value is not None - and past_key_value[0].shape[2] == key_value_states.shape[1] - ): - # reuse k,v, cross_attentions - key_states = past_key_value[0] - value_states = past_key_value[1] - elif is_cross_attention: - # cross_attentions - key_states = self._shape_qk(self.k_proj(key_value_states), -1, bsz) - value_states = self._shape_v(self.v_proj(key_value_states), -1, bsz) - elif past_key_value is not None: - # reuse k, v, self_attention - key_states = self._shape_qk(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape_v(self.v_proj(hidden_states), -1, bsz) - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = 
torch.cat([past_key_value[1], value_states], dim=2) - else: - # self_attention - key_states = self._shape_qk(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape_v(self.v_proj(hidden_states), -1, bsz) - - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_states, value_states) - - proj_shape = (bsz * self.num_heads, -1, self.squeeze_head_dim) - value_shape = (bsz * self.num_heads, -1, self.head_dim) - query_states = self._shape_qk(query_states, tgt_len, bsz).view(*proj_shape) - key_states = key_states.reshape(*proj_shape) - value_states = value_states.reshape(*value_shape) - - src_len = key_states.size(1) - attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) - - if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): - raise ValueError( - f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, tgt_len, src_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - - if layer_head_mask is not None: - if layer_head_mask.size() != (self.num_heads,): - raise ValueError( - f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" - f" {layer_head_mask.size()}" - ) - attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - - if output_attentions: - # this operation is a bit awkward, but it's required to - # make sure that attn_weights keeps its gradient. - # In order to do so, attn_weights have to be reshaped - # twice and have to be reused in the following - attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) - attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) - else: - attn_weights_reshaped = None - - attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) - attn_output = torch.bmm(attn_probs, value_states) - - if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) - attn_output = attn_output.transpose(1, 2) - - # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be - # partitioned across GPUs when using tensor-parallelism. 
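-        # Shape note: at this point `attn_output` is (bsz, tgt_len, num_heads, head_dim); the reshape below folds the head dimensions back into `embed_dim`.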
- attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) - - attn_output = self.out_proj(attn_output) - - return attn_output, attn_weights_reshaped, past_key_value - - -# Copied from transformers.models.bart.modeling_bart.BartFlashAttention2 with Bart->MBart -class UnimerMBartFlashAttention2(UnimerMBartAttention): - """ - MBart flash attention module. This module inherits from `MBartSqueezeAttention` as the weights of the module stays - untouched. The only required change would be on the forward pass where it needs to correctly call the public API of - flash attention and deal with padding tokens in case the input contains any of them. - """ - - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). - self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - # def _reshape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - # return tensor.view(bsz, seq_len, self.num_heads, self.head_dim) - - def _shape_qk(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.squeeze_head_dim) - - def _shape_v(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim) - - def forward( - self, - hidden_states: torch.Tensor, - key_value_states: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.Tensor] = None, - layer_head_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - # MBartFlashAttention2 attention does not support output_attentions - if output_attentions: - raise ValueError("MBartFlashAttention2 attention does not support output_attentions") - - # if key_value_states are provided this layer is used as a cross-attention layer - # for the decoder - is_cross_attention = key_value_states is not None - - bsz, q_len, _ = hidden_states.size() - - # get query proj - query_states = self._shape_qk(self.q_proj(hidden_states), -1, bsz) - - # get key, value proj - # `past_key_value[0].shape[2] == key_value_states.shape[1]` - # is checking that the `sequence_length` of the `past_key_value` is the same as - # the provided `key_value_states` to support prefix tuning - if ( - is_cross_attention - and past_key_value is not None - and past_key_value[0].shape[2] == key_value_states.shape[1] - ): - # reuse k,v, cross_attentions - key_states = past_key_value[0].transpose(1, 2) - value_states = past_key_value[1].transpose(1, 2) - elif is_cross_attention: - # cross_attentions - key_states = self._shape_qk(self.k_proj(key_value_states), -1, bsz) - value_states = self._shape_v(self.v_proj(key_value_states), -1, bsz) - elif past_key_value is not None: - # reuse k, v, self_attention - key_states = self._shape_qk(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape_v(self.v_proj(hidden_states), -1, bsz) - 
key_states = torch.cat([past_key_value[0].transpose(1, 2), key_states], dim=1) - value_states = torch.cat([past_key_value[1].transpose(1, 2), value_states], dim=1) - else: - # self_attention - key_states = self._shape_qk(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape_v(self.v_proj(hidden_states), -1, bsz) - - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_states.transpose(1, 2), value_states.transpose(1, 2)) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in the correct dtype just to be sure everything works as expected. - # This might slowdown training & inference so it is recommended to not cast the LayerNorms - # in fp32. (LlamaRMSNorm handles it correctly) - - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." - ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - attn_output = self._flash_attention_forward( - query_states, key_states, value_states, attention_mask, q_len, dropout=self.dropout - ) - - attn_output = attn_output.reshape(bsz, q_len, -1) - attn_output = self.out_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward - def _flash_attention_forward( - self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. 
- - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`float`): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. - causal = self.is_causal and query_length != 1 - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - else: - attn_output = flash_attn_func( - query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal - ) - - return attn_output - - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape - - key_layer = index_first_axis( - key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k - ) - value_layer = index_first_axis( - value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k - ) - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. 
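-            # `unpad_input` keeps only the non-padded query positions and returns the index metadata (indices, cu_seqlens, max_seqlen) expected by `flash_attn_varlen_func`.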
- attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - -class UnimerMBartSdpaAttention(UnimerMBartAttention): - def forward( - self, - hidden_states: torch.Tensor, - key_value_states: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.Tensor] = None, - layer_head_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" - if output_attentions or layer_head_mask is not None: - # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once this is implemented. - logger.warning( - "BartModel is using BartSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual attention" - ' implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - return super().forward( - hidden_states, - key_value_states=key_value_states, - past_key_value=past_key_value, - attention_mask=attention_mask, - layer_head_mask=layer_head_mask, - output_attentions=output_attentions, - ) - - # if key_value_states are provided this layer is used as a cross-attention layer - # for the decoder - is_cross_attention = key_value_states is not None - - bsz, tgt_len, _ = hidden_states.size() - - # get query proj - query_states = self.q_proj(hidden_states) - # get key, value proj - # `past_key_value[0].shape[2] == key_value_states.shape[1]` - # is checking that the `sequence_length` of the `past_key_value` is the same as - # the provided `key_value_states` to support prefix tuning - if ( - is_cross_attention - and past_key_value is not None - and past_key_value[0].shape[2] == key_value_states.shape[1] - ): - # reuse k,v, cross_attentions - key_states = past_key_value[0] - value_states = past_key_value[1] - elif is_cross_attention: - # cross_attentions - key_states = self._shape_qk(self.k_proj(key_value_states), -1, bsz) - value_states = self._shape_v(self.v_proj(key_value_states), -1, bsz) - elif past_key_value is not None: - # reuse k, v, self_attention - key_states = self._shape_qk(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape_v(self.v_proj(hidden_states), -1, bsz) - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - else: - # self_attention - key_states = self._shape_qk(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape_v(self.v_proj(hidden_states), -1, bsz) - - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. 
Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_states, value_states) - - query_states = self._shape_qk(query_states, tgt_len, bsz) - - # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment - # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. - # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1. - is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False - - # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask, - # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577 - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - dropout_p=self.dropout if self.training else 0.0, - is_causal=is_causal, - ) - - if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2) - - # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be - # partitioned across GPUs when using tensor-parallelism. - attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) - - attn_output = self.out_proj(attn_output) - - return attn_output, None, past_key_value - -UNIMER_MBART_ATTENTION_CLASSES = { - "eager": UnimerMBartAttention, - "flash_attention_2": UnimerMBartFlashAttention2, - "sdpa": UnimerMBartSdpaAttention, -} - - -class UnimerMBartEncoderLayer(nn.Module): - def __init__(self, config: UnimerMBartConfig): - super().__init__() - self.embed_dim = config.d_model - - self.self_attn = UNIMER_MBART_ATTENTION_CLASSES[config._attn_implementation]( - embed_dim=self.embed_dim, - num_heads=config.encoder_attention_heads, - dropout=config.attention_dropout, - config=config, - ) - self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) - self.dropout = config.dropout - self.activation_fn = ACT2FN[config.activation_function] - self.activation_dropout = config.activation_dropout - self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) - self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) - self.final_layer_norm = nn.LayerNorm(self.embed_dim) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: torch.Tensor, - layer_head_mask: torch.Tensor, - output_attentions: bool = False, - ) -> torch.Tensor: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size - `(encoder_attention_heads,)`. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. 
See `attentions` under - returned tensors for more detail. - """ - residual = hidden_states - hidden_states = self.self_attn_layer_norm(hidden_states) - hidden_states, attn_weights, _ = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - layer_head_mask=layer_head_mask, - output_attentions=output_attentions, - ) - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = residual + hidden_states - - residual = hidden_states - hidden_states = self.final_layer_norm(hidden_states) - hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) - hidden_states = self.fc2(hidden_states) - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = residual + hidden_states - - if hidden_states.dtype == torch.float16 and ( - torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any() - ): - clamp_value = torch.finfo(hidden_states.dtype).max - 1000 - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - - outputs = (hidden_states,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs - - -class UnimerMBartDecoderLayer(nn.Module): - def __init__(self, config: UnimerMBartConfig): - super().__init__() - self.embed_dim = config.d_model - - self.self_attn = UNIMER_MBART_ATTENTION_CLASSES[config._attn_implementation]( - embed_dim=self.embed_dim, - num_heads=config.decoder_attention_heads, - dropout=config.attention_dropout, - is_decoder=True, - is_causal=True, - config=config, - ) - self.dropout = config.dropout - self.activation_fn = ACT2FN[config.activation_function] - self.activation_dropout = config.activation_dropout - - self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) - self.encoder_attn = UNIMER_MBART_ATTENTION_CLASSES[config._attn_implementation]( - self.embed_dim, - config.decoder_attention_heads, - dropout=config.attention_dropout, - is_decoder=True, - config=config, - ) - self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) - self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) - self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) - self.final_layer_norm = nn.LayerNorm(self.embed_dim) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, - layer_head_mask: Optional[torch.Tensor] = None, - cross_attn_layer_head_mask: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = True, - ) -> torch.Tensor: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - encoder_hidden_states (`torch.FloatTensor`): - cross attention input to the layer of shape `(batch, seq_len, embed_dim)` - encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size - `(encoder_attention_heads,)`. 
- cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of - size `(decoder_attention_heads,)`. - past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - """ - residual = hidden_states - hidden_states = self.self_attn_layer_norm(hidden_states) - - # Self Attention - # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None - # add present self-attn cache to positions 1,2 of present_key_value tuple - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - past_key_value=self_attn_past_key_value, - attention_mask=attention_mask, - layer_head_mask=layer_head_mask, - output_attentions=output_attentions, - ) - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = residual + hidden_states - - # Cross-Attention Block - cross_attn_present_key_value = None - cross_attn_weights = None - if encoder_hidden_states is not None: - residual = hidden_states - hidden_states = self.encoder_attn_layer_norm(hidden_states) - - # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple - cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None - hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( - hidden_states=hidden_states, - key_value_states=encoder_hidden_states, - attention_mask=encoder_attention_mask, - layer_head_mask=cross_attn_layer_head_mask, - past_key_value=cross_attn_past_key_value, - output_attentions=output_attentions, - ) - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = residual + hidden_states - - # add cross-attn to positions 3,4 of present_key_value tuple - present_key_value = present_key_value + cross_attn_present_key_value - - # Fully Connected - residual = hidden_states - hidden_states = self.final_layer_norm(hidden_states) - hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) - hidden_states = self.fc2(hidden_states) - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights, cross_attn_weights) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -# Copied from transformers.models.bart.modeling_bart.BartClassificationHead with Bart->MBart -class UnimerMBartClassificationHead(nn.Module): - """Head for sentence-level classification tasks.""" - - def __init__( - self, - input_dim: int, - inner_dim: int, - num_classes: int, - pooler_dropout: float, - ): - super().__init__() - self.dense = nn.Linear(input_dim, inner_dim) - self.dropout = nn.Dropout(p=pooler_dropout) - self.out_proj = nn.Linear(inner_dim, num_classes) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dropout(hidden_states) - hidden_states = self.dense(hidden_states) - hidden_states = torch.tanh(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = 
self.out_proj(hidden_states) - return hidden_states - - -class UnimerMBartPreTrainedModel(PreTrainedModel): - config_class = UnimerMBartConfig - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["MBartDecoderLayer", "MBartSqueezeAttention"] - _supports_flash_attn_2 = True - _supports_sdpa = True - - def _init_weights(self, module): - std = self.config.init_std - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - @property - def dummy_inputs(self): - pad_token = self.config.pad_token_id - input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device) - dummy_inputs = { - "attention_mask": input_ids.ne(pad_token), - "input_ids": input_ids, - } - return dummy_inputs - - -MBART_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`MBartConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - -MBART_GENERATION_EXAMPLE = r""" - Translation example: - - ```python - >>> from transformers import AutoTokenizer, MBartForConditionalGeneration - - >>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro") - >>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-en-ro") - - >>> example_english_phrase = "42 is the answer" - >>> inputs = tokenizer(example_english_phrase, return_tensors="pt") - - >>> # Translate - >>> generated_ids = model.generate(**inputs, num_beams=4, max_length=5) - >>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - '42 este răspuns' - ``` - - Mask filling example: - - ```python - >>> from transformers import AutoTokenizer, MBartForConditionalGeneration - - >>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-cc25") - >>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-cc25") - - >>> # de_DE is the language symbol id for German - >>> TXT = " Meine Freunde sind nett aber sie essen zu viel Kuchen. de_DE" - - >>> input_ids = tokenizer([TXT], add_special_tokens=False, return_tensors="pt")["input_ids"] - >>> logits = model(input_ids).logits - - >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() - >>> probs = logits[0, masked_index].softmax(dim=0) - >>> values, predictions = probs.topk(5) - - >>> tokenizer.decode(predictions).split() - ['nett', 'sehr', 'ganz', 'nicht', 'so'] - ``` -""" - -MBART_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. 
Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): - Indices of decoder input sequence tokens in the vocabulary. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are decoder input IDs?](../glossary#decoder-input-ids) - - MBart uses a specific language id token as the starting token for `decoder_input_ids` generation that - varies according to source and target language, *e.g.* 25004 for *en_XX*, and 25003 for *de_DE*. If - `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - For translation and summarization training, `decoder_input_ids` should be provided. If no - `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right - for denoising pre-training following the paper. - decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): - Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also - be used by default. - head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): - Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): - Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): - Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0, - 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): - Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) - `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of - hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape - `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. 
- - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert `input_ids` indices into associated vectors - than the model's internal embedding lookup matrix. - decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded - representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be - input (see `past_key_values`). This is useful if you want more control over how to convert - `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. - - If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value - of `inputs_embeds`. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -class UnimerMBartEncoder(UnimerMBartPreTrainedModel): - """ - Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a - [`MBartEncoderLayer`]. 
- - Args: - config: MBartConfig - embed_tokens (nn.Embedding): output embedding - """ - - def __init__(self, config: UnimerMBartConfig, embed_tokens: Optional[nn.Embedding] = None): - super().__init__(config) - - self.dropout = config.dropout - self.layerdrop = config.encoder_layerdrop - - embed_dim = config.d_model - self.padding_idx = config.pad_token_id - self.max_source_positions = config.max_position_embeddings - embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 - - self.embed_tokens = UnimerMBartScaledWordEmbedding( - config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale - ) - - if embed_tokens is not None: - self.embed_tokens.weight = embed_tokens.weight - - self.embed_positions = UnimerMBartLearnedPositionalEmbedding( - config.max_position_embeddings, - embed_dim, - ) - self.layers = nn.ModuleList([UnimerMBartEncoderLayer(config) for _ in range(config.encoder_layers)]) - self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" - self._use_sdpa = config._attn_implementation == "sdpa" - self.layernorm_embedding = nn.LayerNorm(embed_dim) - self.layer_norm = nn.LayerNorm(config.d_model) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def _backward_compatibility_gradient_checkpointing(self): - # Override to not delete the attribute from the config - if self.supports_gradient_checkpointing and getattr(self.config, "gradient_checkpointing", False): - self.gradient_checkpointing_enable() - - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: - r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you - provide it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): - Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert `input_ids` indices into associated vectors - than the model's internal embedding lookup matrix. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. 
- output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - input = input_ids - input_shape = input.shape - input_ids = input_ids.view(-1, input_shape[-1]) - elif inputs_embeds is not None: - input = inputs_embeds[:, :, -1] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - embed_pos = self.embed_positions(input) - - hidden_states = inputs_embeds + embed_pos.to(inputs_embeds.device) - hidden_states = self.layernorm_embedding(hidden_states) - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - - # expand attention_mask - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - if self._use_flash_attention_2: - attention_mask = attention_mask if 0 in attention_mask else None - elif self._use_sdpa and head_mask is None and not output_attentions: - # output_attentions=True & head_mask can not be supported when using SDPA, fall back to - # the manual implementation that requires a 4D causal mask in all cases. - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype) - else: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype) - - encoder_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - - # check if head_mask has a correct number of layers specified if desired - if head_mask is not None: - if head_mask.size()[0] != len(self.layers): - raise ValueError( - f"The head_mask should be specified for {len(self.layers)} layers, but it is for" - f" {head_mask.size()[0]}." 
- ) - for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - to_drop = False - if self.training: - dropout_probability = torch.rand([]) - if dropout_probability < self.layerdrop: # skip the layer - to_drop = True - - if to_drop: - layer_outputs = (None, None) - else: - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - encoder_layer.__call__, - hidden_states, - attention_mask, - (head_mask[idx] if head_mask is not None else None), - output_attentions, - ) - else: - layer_outputs = encoder_layer( - hidden_states, - attention_mask, - layer_head_mask=(head_mask[idx] if head_mask is not None else None), - output_attentions=output_attentions, - ) - - hidden_states = layer_outputs[0] - - if output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) - - hidden_states = self.layer_norm(hidden_states) - - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) - return BaseModelOutput( - last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions - ) - - -class UnimerMBartDecoder(UnimerMBartPreTrainedModel): - """ - Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MBartDecoderLayer`] - - Args: - config: MBartConfig - embed_tokens (nn.Embedding): output embedding - """ - - def __init__(self, config: UnimerMBartConfig, embed_tokens: Optional[nn.Embedding] = None): - super().__init__(config) - self.dropout = config.dropout - self.layerdrop = config.decoder_layerdrop - self.padding_idx = config.pad_token_id - self.max_target_positions = config.max_position_embeddings - embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 - - self.embed_tokens = UnimerMBartScaledWordEmbedding( - config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale - ) - - if embed_tokens is not None: - self.embed_tokens.weight = embed_tokens.weight - - self.embed_positions = UnimerMBartLearnedPositionalEmbedding( - config.max_position_embeddings, - config.d_model, - ) - self.layers = nn.ModuleList([UnimerMBartDecoderLayer(config) for _ in range(config.decoder_layers)]) - self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" - self._use_sdpa = config._attn_implementation == "sdpa" - self.layernorm_embedding = nn.LayerNorm(config.d_model) - self.layer_norm = nn.LayerNorm(config.d_model) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - count_pred: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.Tensor] = None, - cross_attn_head_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - 
return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: - r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you - provide it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention - of the decoder. - encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*): - Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values - selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): - Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): - Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing - cross-attention on hidden heads. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of - shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. - - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the - cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those - that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of - all `decoder_input_ids` of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert `input_ids` indices into associated vectors - than the model's internal embedding lookup matrix. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. 
- output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - input = input_ids - input_shape = input.size() - input_ids = input_ids.view(-1, input_shape[-1]) - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - input = inputs_embeds[:, :, -1] - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - # past_key_values_length - past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - if self._use_flash_attention_2: - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - elif self._use_sdpa and not output_attentions and cross_attn_head_mask is None: - # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. - attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - input_shape, - inputs_embeds, - past_key_values_length, - ) - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, input_shape, inputs_embeds, past_key_values_length - ) - - # expand encoder attention mask - if encoder_hidden_states is not None and encoder_attention_mask is not None: - if self._use_flash_attention_2: - encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None - elif self._use_sdpa and cross_attn_head_mask is None and not output_attentions: - # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. 
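-                # The encoder mask here is a padding mask rather than a causal one, so it only needs to be broadcast to 4D using the decoder's target length.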
- # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa( - encoder_attention_mask, - inputs_embeds.dtype, - tgt_len=input_shape[-1], - ) - else: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - encoder_attention_mask = _prepare_4d_attention_mask( - encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] - ) - - # embed positions - positions = self.embed_positions(input, past_key_values_length) - - hidden_states = inputs_embeds + positions.to(inputs_embeds.device) - - # TODO: add counting context weight to hidden_states - if count_pred is not None: - count_context_weight = self.counting_context_weight(count_pred) - hidden_states = hidden_states + 0.5 * count_context_weight.unsqueeze(1) - - hidden_states = self.layernorm_embedding(hidden_states) - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`..." - ) - use_cache = False - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None - next_decoder_cache = () if use_cache else None - - # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired - for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]): - if attn_mask is not None: - if attn_mask.size()[0] != len(self.layers): - raise ValueError( - f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for" - f" {attn_mask.size()[0]}." 
- ) - for idx, decoder_layer in enumerate(self.layers): - # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - if output_hidden_states: - all_hidden_states += (hidden_states,) - if self.training: - dropout_probability = torch.rand([]) - if dropout_probability < self.layerdrop: - continue - - past_key_value = past_key_values[idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - encoder_hidden_states, - encoder_attention_mask, - head_mask[idx] if head_mask is not None else None, - cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None, - None, - output_attentions, - use_cache, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - layer_head_mask=(head_mask[idx] if head_mask is not None else None), - cross_attn_layer_head_mask=( - cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None - ), - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[3 if output_attentions else 1],) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - if encoder_hidden_states is not None: - all_cross_attentions += (layer_outputs[2],) - - hidden_states = self.layer_norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple( - v - for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions] - if v is not None - ) - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - cross_attentions=all_cross_attentions, - ) - - -@add_start_docstrings( - "The bare MBART Model outputting raw hidden-states without any specific head on top.", - MBART_START_DOCSTRING, -) -class UnimerMBartModel(UnimerMBartPreTrainedModel): - _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] - - def __init__(self, config: UnimerMBartConfig): - super().__init__(config) - - padding_idx, vocab_size = config.pad_token_id, config.vocab_size - self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx) - - self.encoder = UnimerMBartEncoder(config, self.shared) - self.decoder = UnimerMBartDecoder(config, self.shared) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.shared - - def set_input_embeddings(self, value): - self.shared = value - self.encoder.embed_tokens = self.shared - self.decoder.embed_tokens = self.shared - - def get_encoder(self): - return self.encoder - - def get_decoder(self): - return self.decoder - - def _tie_weights(self): - if self.config.tie_word_embeddings: - self._tie_or_clone_weights(self.encoder.embed_tokens, self.get_input_embeddings()) - self._tie_or_clone_weights(self.decoder.embed_tokens, self.get_input_embeddings()) - - @add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=Seq2SeqModelOutput, - 
config_class=_CONFIG_FOR_DOC, - expected_output=_EXPECTED_OUTPUT_SHAPE, - ) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - decoder_input_ids: Optional[torch.LongTensor] = None, - decoder_attention_mask: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.Tensor] = None, - decoder_head_mask: Optional[torch.Tensor] = None, - cross_attn_head_mask: Optional[torch.Tensor] = None, - encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - decoder_inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Seq2SeqModelOutput, Tuple[torch.FloatTensor]]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # different to other models, MBart automatically creates decoder_input_ids from - # input_ids if no decoder_input_ids are provided - if decoder_input_ids is None and decoder_inputs_embeds is None: - decoder_input_ids = shift_tokens_right(input_ids, self.config.pad_token_id) - - if encoder_outputs is None: - encoder_outputs = self.encoder( - input_ids=input_ids, - attention_mask=attention_mask, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True - elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): - encoder_outputs = BaseModelOutput( - last_hidden_state=encoder_outputs[0], - hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, - attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, - ) - - # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn) - decoder_outputs = self.decoder( - input_ids=decoder_input_ids, - attention_mask=decoder_attention_mask, - encoder_hidden_states=encoder_outputs[0], - encoder_attention_mask=attention_mask, - head_mask=decoder_head_mask, - cross_attn_head_mask=cross_attn_head_mask, - past_key_values=past_key_values, - inputs_embeds=decoder_inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - if not return_dict: - return decoder_outputs + encoder_outputs - - return Seq2SeqModelOutput( - last_hidden_state=decoder_outputs.last_hidden_state, - past_key_values=decoder_outputs.past_key_values, - decoder_hidden_states=decoder_outputs.hidden_states, - decoder_attentions=decoder_outputs.attentions, - cross_attentions=decoder_outputs.cross_attentions, - encoder_last_hidden_state=encoder_outputs.last_hidden_state, - encoder_hidden_states=encoder_outputs.hidden_states, - encoder_attentions=encoder_outputs.attentions, - ) - - -@add_start_docstrings( - "The MBART Model with a language modeling head. 
Can be used for summarization, after fine-tuning the pretrained models.", - MBART_START_DOCSTRING, -) -class UnimerMBartForConditionalGeneration(UnimerMBartPreTrainedModel, GenerationMixin): - base_model_prefix = "model" - _keys_to_ignore_on_load_missing = ["final_logits_bias"] - _tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight", "lm_head.weight"] - - def __init__(self, config: UnimerMBartConfig): - super().__init__(config) - self.model = UnimerMBartModel(config) - self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) - self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_encoder(self): - return self.model.get_encoder() - - def get_decoder(self): - return self.model.get_decoder() - - def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None) -> nn.Embedding: - new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of) - self._resize_final_logits_bias(new_embeddings.weight.shape[0]) - return new_embeddings - - def _resize_final_logits_bias(self, new_num_tokens: int) -> None: - old_num_tokens = self.final_logits_bias.shape[-1] - if new_num_tokens <= old_num_tokens: - new_bias = self.final_logits_bias[:, :new_num_tokens] - else: - extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device) - new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) - self.register_buffer("final_logits_bias", new_bias) - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - @add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) - @add_end_docstrings(MBART_GENERATION_EXAMPLE) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - decoder_input_ids: Optional[torch.LongTensor] = None, - decoder_attention_mask: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.Tensor] = None, - decoder_head_mask: Optional[torch.Tensor] = None, - cross_attn_head_mask: Optional[torch.Tensor] = None, - encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - decoder_inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Seq2SeqLMOutput, Tuple[torch.FloatTensor]]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. 
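# NOTE: illustrative sketch, not part of the deleted module above. `_resize_final_logits_bias`
# keeps the per-token bias in step with the vocabulary size: growing pads with zeros, shrinking
# truncates. The standalone helper below (`resize_bias`, a hypothetical name) mirrors that logic:
import torch

def resize_bias(bias: torch.Tensor, new_num_tokens: int) -> torch.Tensor:
    old_num_tokens = bias.shape[-1]
    if new_num_tokens <= old_num_tokens:
        return bias[:, :new_num_tokens]
    extra = torch.zeros((1, new_num_tokens - old_num_tokens), device=bias.device)
    return torch.cat([bias, extra], dim=1)

bias = torch.zeros(1, 5)
print(resize_bias(bias, 8).shape)  # torch.Size([1, 8]) -- padded with zeros
print(resize_bias(bias, 3).shape)  # torch.Size([1, 3]) -- truncated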
- - Returns: - - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if labels is not None: - if use_cache: - logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") - use_cache = False - if decoder_input_ids is None and decoder_inputs_embeds is None: - decoder_input_ids = shift_tokens_right(labels, self.config.pad_token_id) - - outputs = self.model( - input_ids, - attention_mask=attention_mask, - decoder_input_ids=decoder_input_ids, - encoder_outputs=encoder_outputs, - decoder_attention_mask=decoder_attention_mask, - head_mask=head_mask, - decoder_head_mask=decoder_head_mask, - cross_attn_head_mask=cross_attn_head_mask, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - decoder_inputs_embeds=decoder_inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias - - masked_lm_loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) - - if not return_dict: - output = (lm_logits,) + outputs[1:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - - return Seq2SeqLMOutput( - loss=masked_lm_loss, - logits=lm_logits, - past_key_values=outputs.past_key_values, - decoder_hidden_states=outputs.decoder_hidden_states, - decoder_attentions=outputs.decoder_attentions, - cross_attentions=outputs.cross_attentions, - encoder_last_hidden_state=outputs.encoder_last_hidden_state, - encoder_hidden_states=outputs.encoder_hidden_states, - encoder_attentions=outputs.encoder_attentions, - ) - - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if decoder_input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = decoder_input_ids.shape[1] - 1 - - decoder_input_ids = decoder_input_ids[:, remove_prefix_length:] - - return { - "input_ids": None, # encoder_outputs is defined. 
input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, # change this to avoid caching (presumably for debugging) - } - - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): - return shift_tokens_right(labels, self.config.pad_token_id) - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - # cached cross_attention states don't have to be reordered -> they are always the same - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2]) - + layer_past[2:], - ) - return reordered_past - - -@add_start_docstrings( - """ - MBart model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE - tasks. - """, - MBART_START_DOCSTRING, -) -class UnimerMBartForSequenceClassification(UnimerMBartPreTrainedModel): - _tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight"] - - def __init__(self, config: UnimerMBartConfig, **kwargs): - super().__init__(config, **kwargs) - self.model = UnimerMBartModel(config) - self.classification_head = UnimerMBartClassificationHead( - config.d_model, - config.d_model, - config.num_labels, - config.classifier_dropout, - ) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=Seq2SeqSequenceClassifierOutput, - config_class=_CONFIG_FOR_DOC, - ) - # Copied from transformers.models.bart.modeling_bart.BartForSequenceClassification.forward - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - decoder_input_ids: Optional[torch.LongTensor] = None, - decoder_attention_mask: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.Tensor] = None, - decoder_head_mask: Optional[torch.Tensor] = None, - cross_attn_head_mask: Optional[torch.Tensor] = None, - encoder_outputs: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - decoder_inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, Seq2SeqSequenceClassifierOutput]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
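# NOTE: illustrative sketch, not part of the deleted module above. `_reorder_cache` re-indexes
# the cached self-attention keys/values along the batch (beam) axis so the cache follows the
# surviving beams during beam search; cached cross-attention states are left untouched. Toy demo:
import torch

self_attn_cache = (torch.arange(3.0).view(3, 1, 1, 1),   # key,   [num_beams, heads, seq, head_dim]
                   torch.arange(3.0).view(3, 1, 1, 1))   # value
beam_idx = torch.tensor([1, 1, 0])                        # beams 1, 1 and 0 survived this step
reordered = tuple(t.index_select(0, beam_idx) for t in self_attn_cache)
print(reordered[0].flatten().tolist())  # [1.0, 1.0, 0.0]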
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if labels is not None: - use_cache = False - - if input_ids is None and inputs_embeds is not None: - raise NotImplementedError( - f"Passing input embeddings is currently not supported for {self.__class__.__name__}" - ) - - outputs = self.model( - input_ids, - attention_mask=attention_mask, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - head_mask=head_mask, - decoder_head_mask=decoder_head_mask, - cross_attn_head_mask=cross_attn_head_mask, - encoder_outputs=encoder_outputs, - inputs_embeds=inputs_embeds, - decoder_inputs_embeds=decoder_inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = outputs[0] # last hidden state - - eos_mask = input_ids.eq(self.config.eos_token_id).to(hidden_states.device) - - if len(torch.unique_consecutive(eos_mask.sum(1))) > 1: - raise ValueError("All examples must have the same number of tokens.") - sentence_representation = hidden_states[eos_mask, :].view(hidden_states.size(0), -1, hidden_states.size(-1))[ - :, -1, : - ] - logits = self.classification_head(sentence_representation) - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.config.num_labels == 1: - self.config.problem_type = "regression" - elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.config.num_labels == 1: - loss = loss_fct(logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[1:] - return ((loss,) + output) if loss is not None else output - - return Seq2SeqSequenceClassifierOutput( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - decoder_hidden_states=outputs.decoder_hidden_states, - decoder_attentions=outputs.decoder_attentions, - cross_attentions=outputs.cross_attentions, - encoder_last_hidden_state=outputs.encoder_last_hidden_state, - encoder_hidden_states=outputs.encoder_hidden_states, - encoder_attentions=outputs.encoder_attentions, - ) - - -@add_start_docstrings( - """ - MBART Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear - layer on top of the hidden-states output to compute `span start logits` and `span end logits`). 
- """, - MBART_START_DOCSTRING, -) -class UnimerMBartForQuestionAnswering(UnimerMBartPreTrainedModel): - _tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight"] - - def __init__(self, config): - super().__init__(config) - - config.num_labels = 2 - self.num_labels = config.num_labels - - self.model = UnimerMBartModel(config) - self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=Seq2SeqQuestionAnsweringModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - # Copied from transformers.models.bart.modeling_bart.BartForQuestionAnswering.forward - def forward( - self, - input_ids: torch.Tensor = None, - attention_mask: Optional[torch.Tensor] = None, - decoder_input_ids: Optional[torch.LongTensor] = None, - decoder_attention_mask: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.Tensor] = None, - decoder_head_mask: Optional[torch.Tensor] = None, - cross_attn_head_mask: Optional[torch.Tensor] = None, - encoder_outputs: Optional[List[torch.FloatTensor]] = None, - start_positions: Optional[torch.LongTensor] = None, - end_positions: Optional[torch.LongTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - decoder_inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, Seq2SeqQuestionAnsweringModelOutput]: - r""" - start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence - are not taken into account for computing the loss. - end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence - are not taken into account for computing the loss. 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if start_positions is not None and end_positions is not None: - use_cache = False - - outputs = self.model( - input_ids, - attention_mask=attention_mask, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - head_mask=head_mask, - decoder_head_mask=decoder_head_mask, - cross_attn_head_mask=cross_attn_head_mask, - encoder_outputs=encoder_outputs, - inputs_embeds=inputs_embeds, - decoder_inputs_embeds=decoder_inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1).contiguous() - end_logits = end_logits.squeeze(-1).contiguous() - - total_loss = None - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions = start_positions.clamp(0, ignored_index) - end_positions = end_positions.clamp(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - - if not return_dict: - output = ( - start_logits, - end_logits, - ) + outputs[1:] - return ((total_loss,) + output) if total_loss is not None else output - - return Seq2SeqQuestionAnsweringModelOutput( - loss=total_loss, - start_logits=start_logits, - end_logits=end_logits, - past_key_values=outputs.past_key_values, - decoder_hidden_states=outputs.decoder_hidden_states, - decoder_attentions=outputs.decoder_attentions, - cross_attentions=outputs.cross_attentions, - encoder_last_hidden_state=outputs.encoder_last_hidden_state, - encoder_hidden_states=outputs.encoder_hidden_states, - encoder_attentions=outputs.encoder_attentions, - ) - - -# Copied from transformers.models.bart.modeling_bart.BartDecoderWrapper with Bart->MBart -class UnimerMBartDecoderWrapper(UnimerMBartPreTrainedModel): - """ - This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is - used in combination with the [`EncoderDecoderModel`] framework. 
- """ - - def __init__(self, config): - super().__init__(config) - self.decoder = UnimerMBartDecoder(config) - - def forward(self, *args, **kwargs): - return self.decoder(*args, **kwargs) - - -# Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->MBart, facebook/bart-base->facebook/mbart-large-cc25 -class UnimerMBartForCausalLM(UnimerMBartPreTrainedModel, GenerationMixin): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config): - config = copy.deepcopy(config) - config.is_decoder = True - config.is_encoder_decoder = False - super().__init__(config) - self.model = UnimerMBartDecoderWrapper(config) - - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.decoder.embed_tokens - - def set_input_embeddings(self, value): - self.model.decoder.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model.decoder = decoder - - def get_decoder(self): - return self.model.decoder - - @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentionsAndCounting, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.Tensor] = None, - cross_attn_head_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - count_gt: Optional[torch.LongTensor] = None, - ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]: - r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you - provide it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention - if the model is configured as a decoder. - encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used - in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: - head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): - Mask to nullify selected heads of the attention modules. 
Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): - Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of - shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional - tensors are only required when the model is used as a decoder in a Sequence to Sequence model. - - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the - cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those - that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of - all `decoder_input_ids` of shape `(batch_size, sequence_length)`. - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, MBartForCausalLM - - >>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-cc25") - >>> model = MBartForCausalLM.from_pretrained("facebook/mbart-large-cc25", add_cross_attention=False) - >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder." 
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") - >>> outputs = model(**inputs) - - >>> logits = outputs.logits - >>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size] - >>> list(logits.shape) == expected_shape - True - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - count_pred = None - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model.decoder( - input_ids=input_ids, - attention_mask=attention_mask, - count_pred=count_pred, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - head_mask=head_mask, - cross_attn_head_mask=cross_attn_head_mask, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - logits = self.lm_head(outputs[0]) - - loss = None - if labels is not None: - labels = labels.to(logits.device) - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1)) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithCrossAttentionsAndCounting( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - cross_attentions=outputs.cross_attentions, - counting=count_pred, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs - ): - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_ids.shape) - - if past_key_values: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - # first step, decoder_cached_states are empty - return { - "input_ids": input_ids, # encoder_outputs is defined. 
input_ids not needed - "attention_mask": attention_mask, - "past_key_values": past_key_values, - "use_cache": use_cache, - } - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past diff --git a/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/tokenization_unimer_mbart.py b/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/tokenization_unimer_mbart.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/__init__.py b/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/__init__.py deleted file mode 100644 index 0b91b3be3580e7b484deb4deae3dfe880e477906..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from .configuration_unimer_swin import UnimerSwinConfig -from .modeling_unimer_swin import UnimerSwinModel -from .image_processing_unimer_swin import UnimerSwinImageProcessor - -__all__ = [ - "UnimerSwinConfig", - "UnimerSwinModel", - "UnimerSwinImageProcessor", -] diff --git a/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/configuration_unimer_swin.py b/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/configuration_unimer_swin.py deleted file mode 100644 index 6c577e7c98dc6a9813af7c56ba15f78232283679..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/configuration_unimer_swin.py +++ /dev/null @@ -1,132 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Donut Swin Transformer model configuration""" - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - - -class UnimerSwinConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`UnimerSwinModel`]. It is used to instantiate a - Donut model according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of the Donut - [naver-clova-ix/donut-base](https://huggingface.co/naver-clova-ix/donut-base) architecture. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - Args: - image_size (`int`, *optional*, defaults to 224): - The size (resolution) of each image. - patch_size (`int`, *optional*, defaults to 4): - The size (resolution) of each patch. 
- num_channels (`int`, *optional*, defaults to 3): - The number of input channels. - embed_dim (`int`, *optional*, defaults to 96): - Dimensionality of patch embedding. - depths (`list(int)`, *optional*, defaults to `[2, 2, 6, 2]`): - Depth of each layer in the Transformer encoder. - num_heads (`list(int)`, *optional*, defaults to `[3, 6, 12, 24]`): - Number of attention heads in each layer of the Transformer encoder. - window_size (`int`, *optional*, defaults to 7): - Size of windows. - mlp_ratio (`float`, *optional*, defaults to 4.0): - Ratio of MLP hidden dimensionality to embedding dimensionality. - qkv_bias (`bool`, *optional*, defaults to `True`): - Whether or not a learnable bias should be added to the queries, keys and values. - hidden_dropout_prob (`float`, *optional*, defaults to 0.0): - The dropout probability for all fully connected layers in the embeddings and encoder. - attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - drop_path_rate (`float`, *optional*, defaults to 0.1): - Stochastic depth rate. - hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): - The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`, - `"selu"` and `"gelu_new"` are supported. - use_absolute_embeddings (`bool`, *optional*, defaults to `False`): - Whether or not to add absolute position embeddings to the patch embeddings. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-05): - The epsilon used by the layer normalization layers. - - Example: - - ```python - >>> from transformers import UnimerSwinConfig, UnimerSwinModel - - >>> # Initializing a Donut naver-clova-ix/donut-base style configuration - >>> configuration = UnimerSwinConfig() - - >>> # Randomly initializing a model from the naver-clova-ix/donut-base style configuration - >>> model = UnimerSwinModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "unimer-swin" - - attribute_map = { - "num_attention_heads": "num_heads", - "num_hidden_layers": "num_layers", - } - - def __init__( - self, - image_size=224, - patch_size=4, - num_channels=3, - embed_dim=96, - depths=[2, 2, 6, 2], - num_heads=[3, 6, 12, 24], - window_size=7, - mlp_ratio=4.0, - qkv_bias=True, - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - drop_path_rate=0.1, - hidden_act="gelu", - use_absolute_embeddings=False, - initializer_range=0.02, - layer_norm_eps=1e-5, - **kwargs, - ): - super().__init__(**kwargs) - - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.embed_dim = embed_dim - self.depths = depths - self.num_layers = len(depths) - self.num_heads = num_heads - self.window_size = window_size - self.mlp_ratio = mlp_ratio - self.qkv_bias = qkv_bias - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.drop_path_rate = drop_path_rate - self.hidden_act = hidden_act - self.use_absolute_embeddings = use_absolute_embeddings - self.layer_norm_eps = layer_norm_eps - self.initializer_range = initializer_range - # we set the hidden_size attribute in order to make Swin work with VisionEncoderDecoderModel - # this indicates the channel dimension after the last stage of the model - 
self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1)) diff --git a/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/image_processing_unimer_swin.py b/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/image_processing_unimer_swin.py deleted file mode 100644 index a16d2433751d294bf1aed2022c466cc6dbaef15b..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/image_processing_unimer_swin.py +++ /dev/null @@ -1,132 +0,0 @@ -from transformers.image_processing_utils import BaseImageProcessor -import numpy as np -import cv2 -import albumentations as alb -from albumentations.pytorch import ToTensorV2 - - -# TODO: dereference cv2 if possible -class UnimerSwinImageProcessor(BaseImageProcessor): - def __init__( - self, - image_size = (192, 672), - ): - self.input_size = [int(_) for _ in image_size] - assert len(self.input_size) == 2 - - self.transform = alb.Compose( - [ - alb.ToGray(), - alb.Normalize((0.7931, 0.7931, 0.7931), (0.1738, 0.1738, 0.1738)), - # alb.Sharpen() - ToTensorV2(), - ] - ) - - def __call__(self, item): - image = self.prepare_input(item) - return self.transform(image=image)['image'][:1] - - @staticmethod - def crop_margin_numpy(img: np.ndarray) -> np.ndarray: - """Crop margins of image using NumPy operations""" - # Convert to grayscale if it's a color image - if len(img.shape) == 3 and img.shape[2] == 3: - gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) - else: - gray = img.copy() - - # Normalize and threshold - if gray.max() == gray.min(): - return img - - normalized = (((gray - gray.min()) / (gray.max() - gray.min())) * 255).astype(np.uint8) - binary = 255 * (normalized < 200).astype(np.uint8) - - # Find bounding box - coords = cv2.findNonZero(binary) # Find all non-zero points (text) - x, y, w, h = cv2.boundingRect(coords) # Find minimum spanning bounding box - - # Return cropped image - return img[y:y + h, x:x + w] - - def prepare_input(self, img, random_padding: bool = False): - """ - Convert PIL Image or numpy array to properly sized and padded image after: - - crop margins - - resize while maintaining aspect ratio - - pad to target size - """ - if img is None: - return None - - # try: - # img = self.crop_margin_numpy(img) - # except Exception: - # # might throw an error for broken files - # return None - - if img.shape[0] == 0 or img.shape[1] == 0: - return None - - # Get current dimensions - h, w = img.shape[:2] - target_h, target_w = self.input_size - - # Calculate scale to preserve aspect ratio (equivalent to resize + thumbnail) - scale = min(target_h / h, target_w / w) - - # Calculate new dimensions - new_h, new_w = int(h * scale), int(w * scale) - - # Resize the image while preserving aspect ratio - resized_img = cv2.resize(img, (new_w, new_h)) - - # Calculate padding values using the existing method - delta_width = target_w - new_w - delta_height = target_h - new_h - - pad_width, pad_height = self._get_padding_values(new_w, new_h, random_padding) - - # Apply padding (convert PIL padding format to OpenCV format) - padding_color = [0, 0, 0] if len(img.shape) == 3 else [0] - - padded_img = cv2.copyMakeBorder( - resized_img, - pad_height, # top - delta_height - pad_height, # bottom - pad_width, # left - delta_width - pad_width, # right - cv2.BORDER_CONSTANT, - value=padding_color - ) - - return padded_img - - def _calculate_padding(self, new_w, new_h, random_padding): - """Calculate padding values for PIL images""" - delta_width = self.input_size[1] - new_w - delta_height = 
self.input_size[0] - new_h - - pad_width, pad_height = self._get_padding_values(new_w, new_h, random_padding) - - return ( - pad_width, - pad_height, - delta_width - pad_width, - delta_height - pad_height, - ) - - def _get_padding_values(self, new_w, new_h, random_padding): - """Get padding values based on image dimensions and padding strategy""" - delta_width = self.input_size[1] - new_w - delta_height = self.input_size[0] - new_h - - if random_padding: - pad_width = np.random.randint(low=0, high=delta_width + 1) - pad_height = np.random.randint(low=0, high=delta_height + 1) - else: - pad_width = delta_width // 2 - pad_height = delta_height // 2 - - return pad_width, pad_height diff --git a/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/modeling_unimer_swin.py b/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/modeling_unimer_swin.py deleted file mode 100644 index 1b808e8bdc2b2c760598ca5d0dbd2705e42f1072..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/modeling_unimer_swin.py +++ /dev/null @@ -1,1084 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch UnimerSwin Transformer model. - -This implementation is identical to a regular Swin Transformer, without final layer norm on top of the final hidden -states.""" - -import collections.abc -import math -from dataclasses import dataclass -from typing import Optional, Tuple, Union - -import torch -import torch.utils.checkpoint -from torch import nn - -from transformers.activations import ACT2FN -from transformers.modeling_utils import PreTrainedModel -from transformers.pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer -from transformers.utils import ( - ModelOutput, - add_code_sample_docstrings, - add_start_docstrings, - add_start_docstrings_to_model_forward, - logging, - torch_int, -) -from .configuration_unimer_swin import UnimerSwinConfig - - -logger = logging.get_logger(__name__) - -# General docstring -_CONFIG_FOR_DOC = "UnimerSwinConfig" - -# Base docstring -_CHECKPOINT_FOR_DOC = "https://huggingface.co/naver-clova-ix/donut-base" -_EXPECTED_OUTPUT_SHAPE = [1, 49, 768] - - -@dataclass -# Copied from transformers.models.swin.modeling_swin.SwinEncoderOutput with Swin->UnimerSwin -class UnimerSwinEncoderOutput(ModelOutput): - """ - UnimerSwin encoder's outputs, with potential hidden states and attentions. - - Args: - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of - shape `(batch_size, sequence_length, hidden_size)`. 
- - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of - shape `(batch_size, hidden_size, height, width)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to - include the spatial dimensions. - """ - - last_hidden_state: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None - attentions: Optional[Tuple[torch.FloatTensor, ...]] = None - reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None - - -@dataclass -# Copied from transformers.models.swin.modeling_swin.SwinModelOutput with Swin->UnimerSwin -class UnimerSwinModelOutput(ModelOutput): - """ - UnimerSwin model's outputs that also contains a pooling of the last hidden states. - - Args: - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed): - Average pooling of the last layer hidden-state. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of - shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of - shape `(batch_size, hidden_size, height, width)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to - include the spatial dimensions. - """ - - last_hidden_state: torch.FloatTensor = None - pooler_output: Optional[torch.FloatTensor] = None - hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None - attentions: Optional[Tuple[torch.FloatTensor, ...]] = None - reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None - - -# Copied from transformers.models.swin.modeling_swin.window_partition -def window_partition(input_feature, window_size): - """ - Partitions the given input into windows. 
- """ - batch_size, height, width, num_channels = input_feature.shape - input_feature = input_feature.view( - batch_size, height // window_size, window_size, width // window_size, window_size, num_channels - ) - windows = input_feature.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels) - return windows - - -# Copied from transformers.models.swin.modeling_swin.window_reverse -def window_reverse(windows, window_size, height, width): - """ - Merges windows to produce higher resolution features. - """ - num_channels = windows.shape[-1] - windows = windows.view(-1, height // window_size, width // window_size, window_size, window_size, num_channels) - windows = windows.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, height, width, num_channels) - return windows - - -# Copied from transformers.models.swin.modeling_swin.SwinEmbeddings with Swin->UnimerSwin -class UnimerSwinEmbeddings(nn.Module): - """ - Construct the patch and position embeddings. Optionally, also the mask token. - """ - - def __init__(self, config, use_mask_token=False): - super().__init__() - - self.patch_embeddings = UnimerSwinPatchEmbeddings(config) - num_patches = self.patch_embeddings.num_patches - self.patch_grid = self.patch_embeddings.grid_size - self.mask_token = nn.Parameter(torch.zeros(1, 1, config.embed_dim)) if use_mask_token else None - - if config.use_absolute_embeddings: - self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.embed_dim)) - else: - self.position_embeddings = None - - ### code added. ### - if config.use_2d_embeddings: - self.row_embeddings = nn.Parameter(torch.zeros(1, self.patch_grid[0] + 1, config.embed_dim)) - self.column_embeddings = nn.Parameter(torch.zeros(1, self.patch_grid[1] + 1, config.embed_dim)) - else: - self.row_embeddings = None - self.column_embeddings = None - ###### - - self.norm = nn.LayerNorm(config.embed_dim) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: - """ - This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher - resolution images. 
- - Source: - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 - """ - - num_patches = embeddings.shape[1] - 1 - num_positions = self.position_embeddings.shape[1] - 1 - if num_patches == num_positions and height == width: - return self.position_embeddings - class_pos_embed = self.position_embeddings[:, 0] - patch_pos_embed = self.position_embeddings[:, 1:] - dim = embeddings.shape[-1] - h0 = height // self.config.patch_size - w0 = width // self.config.patch_size - # we add a small number to avoid floating point error in the interpolation - # see discussion at https://github.com/facebookresearch/dino/issues/8 - h0, w0 = h0 + 0.1, w0 + 0.1 - patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) - patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) - patch_pos_embed = nn.functional.interpolate( - patch_pos_embed, - scale_factor=(h0 / math.sqrt(num_positions), w0 / math.sqrt(num_positions)), - mode="bicubic", - align_corners=False, - ) - patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) - return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) - - def forward( - self, - pixel_values: Optional[torch.FloatTensor], - bool_masked_pos: Optional[torch.BoolTensor] = None, - interpolate_pos_encoding: bool = False, - ) -> Tuple[torch.Tensor]: - _, num_channels, height, width = pixel_values.shape - embeddings, output_dimensions = self.patch_embeddings(pixel_values) - embeddings = self.norm(embeddings) - batch_size, seq_len, _ = embeddings.size() - - if bool_masked_pos is not None: - mask_tokens = self.mask_token.expand(batch_size, seq_len, -1) - # replace the masked visual tokens by mask_tokens - mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens) - embeddings = embeddings * (1.0 - mask) + mask_tokens * mask - - if self.position_embeddings is not None: - # if interpolate_pos_encoding: - # embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) - # else: - # embeddings = embeddings + self.position_embeddings - embeddings = embeddings + self.position_embeddings[:, :seq_len, :] # code edited. - - ### code added. ### - if self.row_embeddings is not None and self.column_embeddings is not None: - # Repeat the x position embeddings across the y axis like 0, 1, 2, 3, 0, 1, 2, 3, ... 
- row_embeddings = self.row_embeddings[:, :output_dimensions[0], :].repeat_interleave(output_dimensions[1], dim=1) - column_embeddings = self.column_embeddings[:, :output_dimensions[1], :].repeat(1, output_dimensions[0], 1) - embeddings = embeddings + row_embeddings + column_embeddings - ###### - - embeddings = self.dropout(embeddings) - - return embeddings, output_dimensions - -class StemLayer(nn.Module): - r""" Stem layer of InternImage - Args: - in_chans (int): number of input channels - out_chans (int): number of output channels - act_layer (str): activation layer - norm_layer (str): normalization layer - """ - - def __init__(self, in_chans=3, out_chans=96, act_layer=nn.GELU, norm_layer='BN'): - super().__init__() - self.conv1 = nn.Conv2d(in_chans, out_chans // 2, kernel_size=3, stride=2, padding=1) - self.norm1 = self.build_norm_layer(out_chans // 2, norm_layer) - self.act = act_layer() - self.conv2 = nn.Conv2d(out_chans // 2, out_chans, kernel_size=3, stride=2, padding=1) - - def build_norm_layer(self, dim, norm_layer): - layers = [] - if norm_layer == 'BN': - layers.append(nn.BatchNorm2d(dim)) - else: - raise NotImplementedError(f'build_norm_layer does not support {norm_layer}') - return nn.Sequential(*layers) - - def forward(self, x): - x = self.conv1(x) - x = self.norm1(x) - x = self.act(x) - x = self.conv2(x) - return x - -# Copied from transformers.models.swin.modeling_swin.SwinPatchEmbeddings with Swin->UnimerSwin -class UnimerSwinPatchEmbeddings(nn.Module): - """ - This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial - `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a - Transformer. - """ - - def __init__(self, config): - super().__init__() - image_size, patch_size = config.image_size, config.patch_size - num_channels, hidden_size = config.num_channels, config.embed_dim - image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) - patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.num_patches = num_patches - self.grid_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1]) - - ### code edited. 
### - # self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) - self.projection = StemLayer(in_chans=num_channels, out_chans=hidden_size) - ### - - def maybe_pad(self, pixel_values, height, width): - if width % self.patch_size[1] != 0: - pad_values = (0, self.patch_size[1] - width % self.patch_size[1]) - pixel_values = nn.functional.pad(pixel_values, pad_values) - if height % self.patch_size[0] != 0: - pad_values = (0, 0, 0, self.patch_size[0] - height % self.patch_size[0]) - pixel_values = nn.functional.pad(pixel_values, pad_values) - return pixel_values - - def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor, Tuple[int]]: - _, num_channels, height, width = pixel_values.shape - # pad the input to be divisible by self.patch_size, if needed - pixel_values = self.maybe_pad(pixel_values, height, width) - embeddings = self.projection(pixel_values) - _, _, height, width = embeddings.shape - output_dimensions = (height, width) - embeddings = embeddings.flatten(2).transpose(1, 2) - - return embeddings, output_dimensions - - -# Copied from transformers.models.swin.modeling_swin.SwinPatchMerging -class UnimerSwinPatchMerging(nn.Module): - """ - Patch Merging Layer. - - Args: - input_resolution (`Tuple[int]`): - Resolution of input feature. - dim (`int`): - Number of input channels. - norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`): - Normalization layer class. - """ - - def __init__(self, input_resolution: Tuple[int], dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None: - super().__init__() - self.input_resolution = input_resolution - self.dim = dim - self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) - self.norm = norm_layer(4 * dim) - - def maybe_pad(self, input_feature, height, width): - should_pad = (height % 2 == 1) or (width % 2 == 1) - if should_pad: - pad_values = (0, 0, 0, width % 2, 0, height % 2) - input_feature = nn.functional.pad(input_feature, pad_values) - - return input_feature - - def forward(self, input_feature: torch.Tensor, input_dimensions: Tuple[int, int]) -> torch.Tensor: - height, width = input_dimensions - # `dim` is height * width - batch_size, dim, num_channels = input_feature.shape - - input_feature = input_feature.view(batch_size, height, width, num_channels) - # pad input to be disible by width and height, if needed - input_feature = self.maybe_pad(input_feature, height, width) - # [batch_size, height/2, width/2, num_channels] - input_feature_0 = input_feature[:, 0::2, 0::2, :] - # [batch_size, height/2, width/2, num_channels] - input_feature_1 = input_feature[:, 1::2, 0::2, :] - # [batch_size, height/2, width/2, num_channels] - input_feature_2 = input_feature[:, 0::2, 1::2, :] - # [batch_size, height/2, width/2, num_channels] - input_feature_3 = input_feature[:, 1::2, 1::2, :] - # batch_size height/2 width/2 4*num_channels - input_feature = torch.cat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1) - input_feature = input_feature.view(batch_size, -1, 4 * num_channels) # batch_size height/2*width/2 4*C - - input_feature = self.norm(input_feature) - input_feature = self.reduction(input_feature) - - return input_feature - - -# Copied from transformers.models.beit.modeling_beit.drop_path -def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: - """ - Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
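UnimerSwinPatchMerging above concatenates every 2x2 neighborhood into a 4C-channel token and projects it down to 2C, halving the grid in both directions. A self-contained shape sketch with assumed toy dimensions:

import torch
import torch.nn as nn

B, H, W, C = 1, 4, 6, 8                        # toy feature map
x = torch.randn(B, H * W, C).view(B, H, W, C)  # tokens laid out on the grid

# Gather the four pixels of each 2x2 block, exactly as in the strided slicing above.
x = torch.cat([x[:, 0::2, 0::2, :], x[:, 1::2, 0::2, :],
               x[:, 0::2, 1::2, :], x[:, 1::2, 1::2, :]], dim=-1)  # (B, H/2, W/2, 4C)
x = x.view(B, -1, 4 * C)                       # (B, H/2 * W/2, 4C)

merge = nn.Sequential(nn.LayerNorm(4 * C), nn.Linear(4 * C, 2 * C, bias=False))
print(merge(x).shape)                          # torch.Size([1, 6, 16]): half resolution, double channels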
- - Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, - however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... - See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the - layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the - argument. - """ - if drop_prob == 0.0 or not training: - return input - keep_prob = 1 - drop_prob - shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets - random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) - random_tensor.floor_() # binarize - output = input.div(keep_prob) * random_tensor - return output - - -# Copied from transformers.models.swin.modeling_swin.SwinDropPath -class UnimerSwinDropPath(nn.Module): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" - - def __init__(self, drop_prob: Optional[float] = None) -> None: - super().__init__() - self.drop_prob = drop_prob - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - return drop_path(hidden_states, self.drop_prob, self.training) - - def extra_repr(self) -> str: - return "p={}".format(self.drop_prob) - - -# Copied from transformers.models.swin.modeling_swin.SwinSelfAttention with Swin->UnimerSwin -class UnimerSwinSelfAttention(nn.Module): - def __init__(self, config, dim, num_heads, window_size): - super().__init__() - if dim % num_heads != 0: - raise ValueError( - f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})" - ) - - self.num_attention_heads = num_heads - self.attention_head_size = int(dim / num_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size - self.window_size = ( - window_size if isinstance(window_size, collections.abc.Iterable) else (window_size, window_size) - ) - - self.relative_position_bias_table = nn.Parameter( - torch.zeros((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), num_heads) - ) - - # get pair-wise relative position index for each token inside the window - coords_h = torch.arange(self.window_size[0]) - coords_w = torch.arange(self.window_size[1]) - coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij")) - coords_flatten = torch.flatten(coords, 1) - relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] - relative_coords = relative_coords.permute(1, 2, 0).contiguous() - relative_coords[:, :, 0] += self.window_size[0] - 1 - relative_coords[:, :, 1] += self.window_size[1] - 1 - relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 - relative_position_index = relative_coords.sum(-1) - self.register_buffer("relative_position_index", relative_position_index) - - self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) - self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) - self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) - - self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - - def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) - x = x.view(new_x_shape) - return x.permute(0, 2, 1, 3) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: 
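drop_path above implements stochastic depth: for each sample the residual branch is either zeroed or kept and rescaled by 1/keep_prob, so the expected output is unchanged. A tiny numeric sketch:

import torch

def drop_path_demo(x, drop_prob=0.5, training=True):
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = 1.0 - drop_prob
    # One Bernoulli draw per sample, broadcast over all remaining dimensions.
    mask = torch.rand(x.shape[0], *([1] * (x.ndim - 1))).add_(keep_prob).floor_()
    return x / keep_prob * mask

torch.manual_seed(0)
x = torch.ones(4, 3)          # a batch of 4 samples
out = drop_path_demo(x)
print(out)                    # dropped rows are all zeros; surviving rows are scaled to 2.0,
                              # so each entry still equals 1.0 in expectation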
Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor]: - batch_size, dim, num_channels = hidden_states.shape - mixed_query_layer = self.query(hidden_states) - - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - query_layer = self.transpose_for_scores(mixed_query_layer) - - # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) - - attention_scores = attention_scores / math.sqrt(self.attention_head_size) - - relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)] - relative_position_bias = relative_position_bias.view( - self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1 - ) - - relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() - attention_scores = attention_scores + relative_position_bias.unsqueeze(0) - - if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in UnimerSwinModel forward() function) - mask_shape = attention_mask.shape[0] - attention_scores = attention_scores.view( - batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim - ) - attention_scores = attention_scores + attention_mask.unsqueeze(1).unsqueeze(0) - attention_scores = attention_scores.view(-1, self.num_attention_heads, dim, dim) - - # Normalize the attention scores to probabilities. - attention_probs = nn.functional.softmax(attention_scores, dim=-1) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. 
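Each head adds a learned bias indexed by the relative (row, col) offset between every query/key pair inside a window; the lookup above flattens those offsets into indices over a (2*Wh-1)*(2*Ww-1) table. A standalone sketch for a 2x2 window:

import torch

num_heads = 1
bias_table = torch.arange(9.0).view(9, num_heads)   # 3*3 = 9 distinct offsets for a 2x2 window

coords = torch.stack(torch.meshgrid(torch.arange(2), torch.arange(2), indexing="ij"))
coords = coords.flatten(1)                           # (2, 4): (row, col) of each of the 4 tokens
rel = (coords[:, :, None] - coords[:, None, :]).permute(1, 2, 0)  # (4, 4, 2) pairwise offsets
rel[:, :, 0] += 1                                    # shift offsets into [0, 2]
rel[:, :, 1] += 1
rel[:, :, 0] *= 3                                    # row-major index into the flat table
index = rel.sum(-1)                                  # (4, 4)

bias = bias_table[index.view(-1)].view(4, 4, num_heads).permute(2, 0, 1)
print(bias.shape)                                    # torch.Size([1, 4, 4]): added to the attention scores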
- attention_probs = self.dropout(attention_probs) - - # Mask heads if we want to - if head_mask is not None: - attention_probs = attention_probs * head_mask - - context_layer = torch.matmul(attention_probs, value_layer) - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) - context_layer = context_layer.view(new_context_layer_shape) - - outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) - - return outputs - - -# Copied from transformers.models.swin.modeling_swin.SwinSelfOutput -class UnimerSwinSelfOutput(nn.Module): - def __init__(self, config, dim): - super().__init__() - self.dense = nn.Linear(dim, dim) - self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - - def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - - return hidden_states - - -# Copied from transformers.models.swin.modeling_swin.SwinAttention with Swin->UnimerSwin -class UnimerSwinAttention(nn.Module): - def __init__(self, config, dim, num_heads, window_size): - super().__init__() - self.self = UnimerSwinSelfAttention(config, dim, num_heads, window_size) - self.output = UnimerSwinSelfOutput(config, dim) - self.pruned_heads = set() - - def prune_heads(self, heads): - if len(heads) == 0: - return - heads, index = find_pruneable_heads_and_indices( - heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads - ) - - # Prune linear layers - self.self.query = prune_linear_layer(self.self.query, index) - self.self.key = prune_linear_layer(self.self.key, index) - self.self.value = prune_linear_layer(self.self.value, index) - self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) - - # Update hyper params and store pruned heads - self.self.num_attention_heads = self.self.num_attention_heads - len(heads) - self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads - self.pruned_heads = self.pruned_heads.union(heads) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor]: - self_outputs = self.self(hidden_states, attention_mask, head_mask, output_attentions) - attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them - return outputs - - -# Copied from transformers.models.swin.modeling_swin.SwinIntermediate -class UnimerSwinIntermediate(nn.Module): - def __init__(self, config, dim): - super().__init__() - self.dense = nn.Linear(dim, int(config.mlp_ratio * dim)) - if isinstance(config.hidden_act, str): - self.intermediate_act_fn = ACT2FN[config.hidden_act] - else: - self.intermediate_act_fn = config.hidden_act - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) - return hidden_states - - -# Copied from transformers.models.swin.modeling_swin.SwinOutput -class UnimerSwinOutput(nn.Module): - def __init__(self, config, dim): - super().__init__() - self.dense = nn.Linear(int(config.mlp_ratio * dim), dim) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - 
hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - return hidden_states - - -class ConvEnhance(nn.Module): - """Depth-wise convolution to get the positional information. - """ - def __init__(self, config, dim, k=3): - super(ConvEnhance, self).__init__() - self.proj = nn.Conv2d(dim, - dim, - (k,k), - (1,1), - (k // 2,k // 2), - groups=dim) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x, size: Tuple[int, int]): - B, N, C = x.shape - H, W = size - assert N == H * W - - feat = x.transpose(1, 2).view(B, C, H, W) - feat = self.proj(feat) - feat = self.act_fn(feat) - feat = feat.flatten(2).transpose(1, 2) - - x = x + feat - return x - - -# Copied from transformers.models.swin.modeling_swin.SwinLayer with Swin->UnimerSwin -class UnimerSwinLayer(nn.Module): - def __init__(self, config, dim, input_resolution, num_heads, shift_size=0): - super().__init__() - self.chunk_size_feed_forward = config.chunk_size_feed_forward - self.shift_size = shift_size - self.window_size = config.window_size - self.input_resolution = input_resolution - self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps) - - self.ce = nn.ModuleList([ConvEnhance(config, dim=dim, k=3), - ConvEnhance(config, dim=dim, k=3)]) - - self.attention = UnimerSwinAttention(config, dim, num_heads, window_size=self.window_size) - self.drop_path = UnimerSwinDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() - self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps) - self.intermediate = UnimerSwinIntermediate(config, dim) - self.output = UnimerSwinOutput(config, dim) - - def set_shift_and_window_size(self, input_resolution): - if min(input_resolution) <= self.window_size: - # if window size is larger than input resolution, we don't partition windows - self.shift_size = torch_int(0) - self.window_size = ( - torch.min(torch.tensor(input_resolution)) if torch.jit.is_tracing() else min(input_resolution) - ) - - def get_attn_mask(self, height, width, dtype, device): - if self.shift_size > 0: - # calculate attention mask for SW-MSA - img_mask = torch.zeros((1, height, width, 1), dtype=dtype, device=device) - height_slices = ( - slice(0, -self.window_size), - slice(-self.window_size, -self.shift_size), - slice(-self.shift_size, None), - ) - width_slices = ( - slice(0, -self.window_size), - slice(-self.window_size, -self.shift_size), - slice(-self.shift_size, None), - ) - count = 0 - for height_slice in height_slices: - for width_slice in width_slices: - img_mask[:, height_slice, width_slice, :] = count - count += 1 - - mask_windows = window_partition(img_mask, self.window_size) - mask_windows = mask_windows.view(-1, self.window_size * self.window_size) - attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) - attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) - else: - attn_mask = None - return attn_mask - - def maybe_pad(self, hidden_states, height, width): - pad_right = (self.window_size - width % self.window_size) % self.window_size - pad_bottom = (self.window_size - height % self.window_size) % self.window_size - pad_values = (0, 0, 0, pad_right, 0, pad_bottom) - hidden_states = nn.functional.pad(hidden_states, pad_values) - return hidden_states, pad_values - - def forward( - self, - hidden_states: torch.Tensor, - input_dimensions: Tuple[int, int], - head_mask: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = False, - always_partition: Optional[bool] 
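ConvEnhance above injects positional information by running a depth-wise 3x3 convolution over the token grid and adding the result back as a residual. A self-contained sketch of the reshape-convolve-flatten round trip (GELU assumed for the activation, toy sizes):

import torch
import torch.nn as nn
import torch.nn.functional as F

B, H, W, C = 1, 8, 12, 16
x = torch.randn(B, H * W, C)                                   # (B, N, C) tokens

dwconv = nn.Conv2d(C, C, kernel_size=3, padding=1, groups=C)   # depth-wise: one 3x3 filter per channel

feat = x.transpose(1, 2).reshape(B, C, H, W)                   # back to an image-like layout
feat = F.gelu(dwconv(feat))                                    # local positional mixing
feat = feat.flatten(2).transpose(1, 2)                         # back to (B, N, C)

out = x + feat                                                 # residual connection
print(out.shape)                                               # torch.Size([1, 96, 16])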
= False, - ) -> Tuple[torch.Tensor, torch.Tensor]: - if not always_partition: - self.set_shift_and_window_size(input_dimensions) - else: - pass - height, width = input_dimensions - batch_size, _, channels = hidden_states.size() - - - - hidden_states = self.ce[0](hidden_states, input_dimensions) - shortcut = hidden_states - - - hidden_states = self.layernorm_before(hidden_states) - hidden_states = hidden_states.view(batch_size, height, width, channels) - - # pad hidden_states to multiples of window size - hidden_states, pad_values = self.maybe_pad(hidden_states, height, width) - - _, height_pad, width_pad, _ = hidden_states.shape - # cyclic shift - if self.shift_size > 0: - shifted_hidden_states = torch.roll(hidden_states, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) - else: - shifted_hidden_states = hidden_states - - # partition windows - hidden_states_windows = window_partition(shifted_hidden_states, self.window_size) - hidden_states_windows = hidden_states_windows.view(-1, self.window_size * self.window_size, channels) - attn_mask = self.get_attn_mask( - height_pad, width_pad, dtype=hidden_states.dtype, device=hidden_states_windows.device - ) - - attention_outputs = self.attention( - hidden_states_windows, attn_mask, head_mask, output_attentions=output_attentions - ) - - attention_output = attention_outputs[0] - - attention_windows = attention_output.view(-1, self.window_size, self.window_size, channels) - shifted_windows = window_reverse(attention_windows, self.window_size, height_pad, width_pad) - - # reverse cyclic shift - if self.shift_size > 0: - attention_windows = torch.roll(shifted_windows, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) - else: - attention_windows = shifted_windows - - was_padded = pad_values[3] > 0 or pad_values[5] > 0 - if was_padded: - attention_windows = attention_windows[:, :height, :width, :].contiguous() - - attention_windows = attention_windows.view(batch_size, height * width, channels) - - hidden_states = shortcut + self.drop_path(attention_windows) - - - - hidden_states = self.ce[1](hidden_states, input_dimensions) - layer_output = self.layernorm_after(hidden_states) - layer_output = self.intermediate(layer_output) - layer_output = hidden_states + self.output(layer_output) - - layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,) - return layer_outputs - - -# Copied from transformers.models.swin.modeling_swin.SwinStage with Swin->UnimerSwin -class UnimerSwinStage(nn.Module): - def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, downsample): - super().__init__() - self.config = config - self.dim = dim - self.blocks = nn.ModuleList( - [ - UnimerSwinLayer( - config=config, - dim=dim, - input_resolution=input_resolution, - num_heads=num_heads, - shift_size=0, - ) - for i in range(depth) - ] - ) - - # patch merging layer - if downsample is not None: - self.downsample = downsample(input_resolution, dim=dim, norm_layer=nn.LayerNorm) - else: - self.downsample = None - - self.pointing = False - - def forward( - self, - hidden_states: torch.Tensor, - input_dimensions: Tuple[int, int], - head_mask: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = False, - always_partition: Optional[bool] = False, - ) -> Tuple[torch.Tensor]: - height, width = input_dimensions - for i, layer_module in enumerate(self.blocks): - layer_head_mask = head_mask[i] if head_mask is not None else None - - layer_outputs = layer_module( - hidden_states, input_dimensions, 
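The layer forward above cyclically shifts the padded feature map, splits it into non-overlapping windows for attention, and then reverses both steps. A self-contained sketch of that round trip (window_partition/window_reverse here are local stand-ins for the module-level helpers, which are not shown in this hunk):

import torch

def window_partition(x, ws):
    B, H, W, C = x.shape
    x = x.view(B, H // ws, ws, W // ws, ws, C)
    return x.permute(0, 1, 3, 2, 4, 5).reshape(-1, ws, ws, C)      # (num_windows * B, ws, ws, C)

def window_reverse(windows, ws, H, W):
    B = windows.shape[0] // (H * W // ws // ws)
    x = windows.view(B, H // ws, W // ws, ws, ws, -1)
    return x.permute(0, 1, 3, 2, 4, 5).reshape(B, H, W, -1)

x = torch.randn(1, 8, 8, 4)                                        # toy padded feature map
shift, ws = 2, 4
shifted = torch.roll(x, shifts=(-shift, -shift), dims=(1, 2))      # cyclic shift
windows = window_partition(shifted, ws)                            # four 4x4 windows
restored = window_reverse(windows, ws, 8, 8)
restored = torch.roll(restored, shifts=(shift, shift), dims=(1, 2))
print(torch.equal(restored, x))                                    # True: the round trip is lossless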
layer_head_mask, output_attentions, always_partition - ) - - hidden_states = layer_outputs[0] - - hidden_states_before_downsampling = hidden_states - if self.downsample is not None: - height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2 - output_dimensions = (height, width, height_downsampled, width_downsampled) - hidden_states = self.downsample(hidden_states_before_downsampling, input_dimensions) - else: - output_dimensions = (height, width, height, width) - - stage_outputs = (hidden_states, hidden_states_before_downsampling, output_dimensions) - - if output_attentions: - stage_outputs += layer_outputs[1:] - return stage_outputs - - -# Copied from transformers.models.swin.modeling_swin.SwinEncoder with Swin->UnimerSwin -class UnimerSwinEncoder(nn.Module): - def __init__(self, config, grid_size): - super().__init__() - self.num_layers = len(config.depths) - self.config = config - dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))] - self.layers = nn.ModuleList( - [ - UnimerSwinStage( - config=config, - dim=int(config.embed_dim * 2**i_layer), - input_resolution=(grid_size[0] // (2**i_layer), grid_size[1] // (2**i_layer)), - depth=config.depths[i_layer], - num_heads=config.num_heads[i_layer], - drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])], - downsample=UnimerSwinPatchMerging if (i_layer < self.num_layers - 1) else None, - ) - for i_layer in range(self.num_layers) - ] - ) - - self.gradient_checkpointing = False - - def forward( - self, - hidden_states: torch.Tensor, - input_dimensions: Tuple[int, int], - head_mask: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = False, - output_hidden_states: Optional[bool] = False, - output_hidden_states_before_downsampling: Optional[bool] = False, - always_partition: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple, UnimerSwinEncoderOutput]: - all_hidden_states = () if output_hidden_states else None - all_reshaped_hidden_states = () if output_hidden_states else None - all_self_attentions = () if output_attentions else None - - if output_hidden_states: - batch_size, _, hidden_size = hidden_states.shape - # rearrange b (h w) c -> b c h w - reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size) - reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2) - all_hidden_states += (hidden_states,) - all_reshaped_hidden_states += (reshaped_hidden_state,) - - for i, layer_module in enumerate(self.layers): - layer_head_mask = head_mask[i] if head_mask is not None else None - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - layer_module.__call__, - hidden_states, - input_dimensions, - layer_head_mask, - output_attentions, - always_partition, - ) - else: - layer_outputs = layer_module( - hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition - ) - - hidden_states = layer_outputs[0] - hidden_states_before_downsampling = layer_outputs[1] - output_dimensions = layer_outputs[2] - - input_dimensions = (output_dimensions[-2], output_dimensions[-1]) - - if output_hidden_states and output_hidden_states_before_downsampling: - batch_size, _, hidden_size = hidden_states_before_downsampling.shape - # rearrange b (h w) c -> b c h w - # here we use the original (not downsampled) height and width - reshaped_hidden_state = hidden_states_before_downsampling.view( - batch_size, *(output_dimensions[0], 
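The encoder above builds one stage per entry in config.depths: channel width doubles and the patch grid halves at every stage, while the stochastic-depth rate grows linearly across all blocks. A small sketch of that bookkeeping with assumed (not actual) config values:

import torch

embed_dim, depths, drop_path_rate = 96, [2, 2, 6, 2], 0.1   # illustrative values only
grid = (56, 56)

dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
for i, depth in enumerate(depths):
    dim = int(embed_dim * 2 ** i)                           # 96, 192, 384, 768
    res = (grid[0] // 2 ** i, grid[1] // 2 ** i)            # 56x56, 28x28, 14x14, 7x7
    rates = dpr[sum(depths[:i]): sum(depths[:i + 1])]       # this stage's drop-path schedule
    print(f"stage {i}: dim={dim}, resolution={res}, drop_path={[round(r, 3) for r in rates]}")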
output_dimensions[1]), hidden_size - ) - reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2) - all_hidden_states += (hidden_states_before_downsampling,) - all_reshaped_hidden_states += (reshaped_hidden_state,) - elif output_hidden_states and not output_hidden_states_before_downsampling: - batch_size, _, hidden_size = hidden_states.shape - # rearrange b (h w) c -> b c h w - reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size) - reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2) - all_hidden_states += (hidden_states,) - all_reshaped_hidden_states += (reshaped_hidden_state,) - - if output_attentions: - all_self_attentions += layer_outputs[3:] - - if not return_dict: - return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) - - return UnimerSwinEncoderOutput( - last_hidden_state=hidden_states, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - reshaped_hidden_states=all_reshaped_hidden_states, - ) - - -# Copied from transformers.models.swin.modeling_swin.SwinPreTrainedModel with Swin->UnimerSwin -class UnimerSwinPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. - """ - - config_class = UnimerSwinConfig - base_model_prefix = "unimer-swin" - main_input_name = "pixel_values" - supports_gradient_checkpointing = True - _no_split_modules = ["UnimerSwinStage"] - - def _init_weights(self, module): - """Initialize the weights""" - if isinstance(module, (nn.Linear, nn.Conv2d)): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - - -SWIN_START_DOCSTRING = r""" - This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use - it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and - behavior. - - Parameters: - config ([`UnimerSwinConfig`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - -SWIN_INPUTS_DOCSTRING = r""" - Args: - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See - [`DonutImageProcessor.__call__`] for details. - head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): - Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. 
- interpolate_pos_encoding (`bool`, *optional*, defaults to `False`): - Whether to interpolate the pre-trained position encodings. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare UnimerSwin Model transformer outputting raw hidden-states without any specific head on top.", - SWIN_START_DOCSTRING, -) -class UnimerSwinModel(UnimerSwinPreTrainedModel): - def __init__(self, config, add_pooling_layer=True, use_mask_token=False): - super().__init__(config) - self.config = config - self.num_layers = len(config.depths) - self.num_features = int(config.embed_dim * 2 ** (self.num_layers - 1)) - - self.embeddings = UnimerSwinEmbeddings(config, use_mask_token=use_mask_token) - self.encoder = UnimerSwinEncoder(config, self.embeddings.patch_grid) - self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embeddings.patch_embeddings - - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base - class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) - - @add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=UnimerSwinModelOutput, - config_class=_CONFIG_FOR_DOC, - modality="vision", - expected_output=_EXPECTED_OUTPUT_SHAPE, - ) - def forward( - self, - pixel_values: Optional[torch.FloatTensor] = None, - bool_masked_pos: Optional[torch.BoolTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - interpolate_pos_encoding: bool = False, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, UnimerSwinModelOutput]: - r""" - bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`): - Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). 
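When add_pooling_layer is enabled, the pooler above is just a mean over the final token sequence, implemented as AdaptiveAvgPool1d(1) after a transpose. A minimal sketch:

import torch
import torch.nn as nn

seq = torch.randn(2, 196, 768)                            # (batch, tokens, hidden) from the encoder
pooler = nn.AdaptiveAvgPool1d(1)

pooled = torch.flatten(pooler(seq.transpose(1, 2)), 1)    # (2, 768)
print(torch.allclose(pooled, seq.mean(dim=1)))            # True: equivalent to averaging over tokens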
- """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if pixel_values is None: - raise ValueError("You have to specify pixel_values") - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, len(self.config.depths)) - - embedding_output, input_dimensions = self.embeddings( - pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding - ) - - encoder_outputs = self.encoder( - embedding_output, - input_dimensions, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = encoder_outputs[0] - - pooled_output = None - if self.pooler is not None: - pooled_output = self.pooler(sequence_output.transpose(1, 2)) - pooled_output = torch.flatten(pooled_output, 1) - - if not return_dict: - output = (sequence_output, pooled_output) + encoder_outputs[1:] - - return output - - return UnimerSwinModelOutput( - last_hidden_state=sequence_output, - pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - reshaped_hidden_states=encoder_outputs.reshaped_hidden_states, - ) diff --git a/magic_pdf/model/sub_modules/model_init.py b/magic_pdf/model/sub_modules/model_init.py deleted file mode 100644 index b885606dd64599897c29d41acfca3f6a843beacc..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/model_init.py +++ /dev/null @@ -1,213 +0,0 @@ -import torch -from loguru import logger - -from magic_pdf.config.constants import MODEL_NAME -from magic_pdf.model.model_list import AtomicModel -from magic_pdf.model.sub_modules.language_detection.yolov11.YOLOv11 import YOLOv11LangDetModel -from magic_pdf.model.sub_modules.layout.doclayout_yolo.DocLayoutYOLO import DocLayoutYOLOModel -from magic_pdf.model.sub_modules.mfd.yolov8.YOLOv8 import YOLOv8MFDModel -from magic_pdf.model.sub_modules.mfr.unimernet.Unimernet import UnimernetModel -from magic_pdf.model.sub_modules.ocr.paddleocr2pytorch.pytorch_paddle import PytorchPaddleOCR -from magic_pdf.model.sub_modules.table.rapidtable.rapid_table import RapidTableModel -# try: -# from magic_pdf_ascend_plugin.libs.license_verifier import ( -# LicenseExpiredError, LicenseFormatError, LicenseSignatureError, -# load_license) -# from magic_pdf_ascend_plugin.model_plugin.ocr.paddleocr.ppocr_273_npu import ModifiedPaddleOCR -# from magic_pdf_ascend_plugin.model_plugin.table.rapidtable.rapid_table_npu import RapidTableModel -# license_key = load_license() -# logger.info(f'Using Ascend Plugin Success, License id is {license_key["payload"]["id"]},' -# f' License expired at {license_key["payload"]["date"]["end_date"]}') -# except Exception as e: -# if isinstance(e, ImportError): -# pass -# elif isinstance(e, LicenseFormatError): -# logger.error('Ascend Plugin: Invalid license format. 
Please check the license file.') -# elif isinstance(e, LicenseSignatureError): -# logger.error('Ascend Plugin: Invalid signature. The license may be tampered with.') -# elif isinstance(e, LicenseExpiredError): -# logger.error('Ascend Plugin: License has expired. Please renew your license.') -# elif isinstance(e, FileNotFoundError): -# logger.error('Ascend Plugin: Not found License file.') -# else: -# logger.error(f'Ascend Plugin: {e}') -# from magic_pdf.model.sub_modules.ocr.paddleocr.ppocr_273_mod import ModifiedPaddleOCR -# # from magic_pdf.model.sub_modules.ocr.paddleocr.ppocr_291_mod import ModifiedPaddleOCR -# from magic_pdf.model.sub_modules.table.rapidtable.rapid_table import RapidTableModel - - -def table_model_init(table_model_type, model_path, max_time, _device_='cpu', lang=None, table_sub_model_name=None): - if table_model_type == MODEL_NAME.STRUCT_EQTABLE: - from magic_pdf.model.sub_modules.table.structeqtable.struct_eqtable import StructTableModel - table_model = StructTableModel(model_path, max_new_tokens=2048, max_time=max_time) - elif table_model_type == MODEL_NAME.TABLE_MASTER: - from magic_pdf.model.sub_modules.table.tablemaster.tablemaster_paddle import TableMasterPaddleModel - config = { - 'model_dir': model_path, - 'device': _device_ - } - table_model = TableMasterPaddleModel(config) - elif table_model_type == MODEL_NAME.RAPID_TABLE: - atom_model_manager = AtomModelSingleton() - ocr_engine = atom_model_manager.get_atom_model( - atom_model_name='ocr', - ocr_show_log=False, - det_db_box_thresh=0.5, - det_db_unclip_ratio=1.6, - lang=lang - ) - table_model = RapidTableModel(ocr_engine, table_sub_model_name) - else: - logger.error('table model type not allow') - exit(1) - - return table_model - - -def mfd_model_init(weight, device='cpu'): - if str(device).startswith('npu'): - device = torch.device(device) - mfd_model = YOLOv8MFDModel(weight, device) - return mfd_model - - -def mfr_model_init(weight_dir, cfg_path, device='cpu'): - mfr_model = UnimernetModel(weight_dir, cfg_path, device) - return mfr_model - - -def layout_model_init(weight, config_file, device): - from magic_pdf.model.sub_modules.layout.layoutlmv3.model_init import Layoutlmv3_Predictor - model = Layoutlmv3_Predictor(weight, config_file, device) - return model - - -def doclayout_yolo_model_init(weight, device='cpu'): - if str(device).startswith('npu'): - device = torch.device(device) - model = DocLayoutYOLOModel(weight, device) - return model - - -def langdetect_model_init(langdetect_model_weight, device='cpu'): - if str(device).startswith('npu'): - device = torch.device(device) - model = YOLOv11LangDetModel(langdetect_model_weight, device) - return model - - -def ocr_model_init(show_log: bool = False, - det_db_box_thresh=0.3, - lang=None, - use_dilation=True, - det_db_unclip_ratio=1.8, - ): - if lang is not None and lang != '': - # model = ModifiedPaddleOCR( - model = PytorchPaddleOCR( - show_log=show_log, - det_db_box_thresh=det_db_box_thresh, - lang=lang, - use_dilation=use_dilation, - det_db_unclip_ratio=det_db_unclip_ratio, - ) - else: - # model = ModifiedPaddleOCR( - model = PytorchPaddleOCR( - show_log=show_log, - det_db_box_thresh=det_db_box_thresh, - use_dilation=use_dilation, - det_db_unclip_ratio=det_db_unclip_ratio, - ) - return model - - -class AtomModelSingleton: - _instance = None - _models = {} - - def __new__(cls, *args, **kwargs): - if cls._instance is None: - cls._instance = super().__new__(cls) - return cls._instance - - def get_atom_model(self, atom_model_name: str, **kwargs): - - lang = 
kwargs.get('lang', None) - layout_model_name = kwargs.get('layout_model_name', None) - table_model_name = kwargs.get('table_model_name', None) - - if atom_model_name in [AtomicModel.OCR]: - key = (atom_model_name, lang) - elif atom_model_name in [AtomicModel.Layout]: - key = (atom_model_name, layout_model_name) - elif atom_model_name in [AtomicModel.Table]: - key = (atom_model_name, table_model_name, lang) - else: - key = atom_model_name - - if key not in self._models: - self._models[key] = atom_model_init(model_name=atom_model_name, **kwargs) - return self._models[key] - -def atom_model_init(model_name: str, **kwargs): - atom_model = None - if model_name == AtomicModel.Layout: - if kwargs.get('layout_model_name') == MODEL_NAME.LAYOUTLMv3: - atom_model = layout_model_init( - kwargs.get('layout_weights'), - kwargs.get('layout_config_file'), - kwargs.get('device') - ) - elif kwargs.get('layout_model_name') == MODEL_NAME.DocLayout_YOLO: - atom_model = doclayout_yolo_model_init( - kwargs.get('doclayout_yolo_weights'), - kwargs.get('device') - ) - else: - logger.error('layout model name not allow') - exit(1) - elif model_name == AtomicModel.MFD: - atom_model = mfd_model_init( - kwargs.get('mfd_weights'), - kwargs.get('device') - ) - elif model_name == AtomicModel.MFR: - atom_model = mfr_model_init( - kwargs.get('mfr_weight_dir'), - kwargs.get('mfr_cfg_path'), - kwargs.get('device') - ) - elif model_name == AtomicModel.OCR: - atom_model = ocr_model_init( - kwargs.get('ocr_show_log'), - kwargs.get('det_db_box_thresh'), - kwargs.get('lang'), - ) - elif model_name == AtomicModel.Table: - atom_model = table_model_init( - kwargs.get('table_model_name'), - kwargs.get('table_model_path'), - kwargs.get('table_max_time'), - kwargs.get('device'), - kwargs.get('lang'), - kwargs.get('table_sub_model_name') - ) - elif model_name == AtomicModel.LangDetect: - if kwargs.get('langdetect_model_name') == MODEL_NAME.YOLO_V11_LangDetect: - atom_model = langdetect_model_init( - kwargs.get('langdetect_model_weight'), - kwargs.get('device') - ) - else: - logger.error('langdetect model name not allow') - exit(1) - else: - logger.error('model name not allow') - exit(1) - - if atom_model is None: - logger.error('model init failed') - exit(1) - else: - return atom_model diff --git a/magic_pdf/model/sub_modules/model_utils.py b/magic_pdf/model/sub_modules/model_utils.py deleted file mode 100644 index 04d0fbbd028290e46cab1adc5911674fe0541ef0..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/model_utils.py +++ /dev/null @@ -1,309 +0,0 @@ -import time -import torch -from loguru import logger -import numpy as np - -from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio -from magic_pdf.libs.clean_memory import clean_memory - - -def crop_img(input_res, input_np_img, crop_paste_x=0, crop_paste_y=0): - - crop_xmin, crop_ymin = int(input_res['poly'][0]), int(input_res['poly'][1]) - crop_xmax, crop_ymax = int(input_res['poly'][4]), int(input_res['poly'][5]) - - # Calculate new dimensions - crop_new_width = crop_xmax - crop_xmin + crop_paste_x * 2 - crop_new_height = crop_ymax - crop_ymin + crop_paste_y * 2 - - # Create a white background array - return_image = np.ones((crop_new_height, crop_new_width, 3), dtype=np.uint8) * 255 - - # Crop the original image using numpy slicing - cropped_img = input_np_img[crop_ymin:crop_ymax, crop_xmin:crop_xmax] - - # Paste the cropped image onto the white background - return_image[crop_paste_y:crop_paste_y + (crop_ymax - crop_ymin), - crop_paste_x:crop_paste_x + 
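get_atom_model above memoizes each constructed model under a key that folds in the language (OCR, table) or sub-model name (layout), so repeated requests reuse one instance per configuration. A stripped-down sketch of the same caching pattern with a dummy factory (the names and kwargs here are illustrative, not the real ones):

class ModelCacheDemo:
    _instance = None
    _models = {}

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def get(self, name, **kwargs):
        # OCR-style models are cached per language, everything else by name only.
        key = (name, kwargs.get('lang')) if name == 'ocr' else name
        if key not in self._models:
            self._models[key] = object()           # stand-in for atom_model_init(...)
        return self._models[key]

cache = ModelCacheDemo()
assert cache.get('ocr', lang='en') is cache.get('ocr', lang='en')      # same instance reused
assert cache.get('ocr', lang='en') is not cache.get('ocr', lang='ch')  # new language, new model
assert ModelCacheDemo() is cache                                       # singleton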
(crop_xmax - crop_xmin)] = cropped_img - - return_list = [crop_paste_x, crop_paste_y, crop_xmin, crop_ymin, crop_xmax, crop_ymax, crop_new_width, - crop_new_height] - return return_image, return_list - - -def get_coords_and_area(block_with_poly): - """Extract coordinates and area from a table.""" - xmin, ymin = int(block_with_poly['poly'][0]), int(block_with_poly['poly'][1]) - xmax, ymax = int(block_with_poly['poly'][4]), int(block_with_poly['poly'][5]) - area = (xmax - xmin) * (ymax - ymin) - return xmin, ymin, xmax, ymax, area - - -def calculate_intersection(box1, box2): - """Calculate intersection coordinates between two boxes.""" - intersection_xmin = max(box1[0], box2[0]) - intersection_ymin = max(box1[1], box2[1]) - intersection_xmax = min(box1[2], box2[2]) - intersection_ymax = min(box1[3], box2[3]) - - # Check if intersection is valid - if intersection_xmax <= intersection_xmin or intersection_ymax <= intersection_ymin: - return None - - return intersection_xmin, intersection_ymin, intersection_xmax, intersection_ymax - - -def calculate_iou(box1, box2): - """Calculate IoU between two boxes.""" - intersection = calculate_intersection(box1[:4], box2[:4]) - - if not intersection: - return 0 - - intersection_xmin, intersection_ymin, intersection_xmax, intersection_ymax = intersection - intersection_area = (intersection_xmax - intersection_xmin) * (intersection_ymax - intersection_ymin) - - area1, area2 = box1[4], box2[4] - union_area = area1 + area2 - intersection_area - - return intersection_area / union_area if union_area > 0 else 0 - - -def is_inside(small_box, big_box, overlap_threshold=0.8): - """Check if small_box is inside big_box by at least overlap_threshold.""" - intersection = calculate_intersection(small_box[:4], big_box[:4]) - - if not intersection: - return False - - intersection_xmin, intersection_ymin, intersection_xmax, intersection_ymax = intersection - intersection_area = (intersection_xmax - intersection_xmin) * (intersection_ymax - intersection_ymin) - - # Check if overlap exceeds threshold - return intersection_area >= overlap_threshold * small_box[4] - - -def do_overlap(box1, box2): - """Check if two boxes overlap.""" - return calculate_intersection(box1[:4], box2[:4]) is not None - - -def merge_high_iou_tables(table_res_list, layout_res, table_indices, iou_threshold=0.7): - """Merge tables with IoU > threshold.""" - if len(table_res_list) < 2: - return table_res_list, table_indices - - table_info = [get_coords_and_area(table) for table in table_res_list] - merged = True - - while merged: - merged = False - i = 0 - while i < len(table_res_list) - 1: - j = i + 1 - while j < len(table_res_list): - iou = calculate_iou(table_info[i], table_info[j]) - - if iou > iou_threshold: - # Merge tables by taking their union - x1_min, y1_min, x1_max, y1_max, _ = table_info[i] - x2_min, y2_min, x2_max, y2_max, _ = table_info[j] - - union_xmin = min(x1_min, x2_min) - union_ymin = min(y1_min, y2_min) - union_xmax = max(x1_max, x2_max) - union_ymax = max(y1_max, y2_max) - - # Create merged table - merged_table = table_res_list[i].copy() - merged_table['poly'][0] = union_xmin - merged_table['poly'][1] = union_ymin - merged_table['poly'][2] = union_xmax - merged_table['poly'][3] = union_ymin - merged_table['poly'][4] = union_xmax - merged_table['poly'][5] = union_ymax - merged_table['poly'][6] = union_xmin - merged_table['poly'][7] = union_ymax - - # Update layout_res - to_remove = [table_indices[j], table_indices[i]] - for idx in sorted(to_remove, reverse=True): - del 
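calculate_iou above is the usual intersection-over-union on (xmin, ymin, xmax, ymax) boxes. A quick numeric check of the same arithmetic:

def iou(box1, box2):
    ix0, iy0 = max(box1[0], box2[0]), max(box1[1], box2[1])
    ix1, iy1 = min(box1[2], box2[2]), min(box1[3], box2[3])
    if ix1 <= ix0 or iy1 <= iy0:                 # no valid intersection
        return 0.0
    inter = (ix1 - ix0) * (iy1 - iy0)
    a1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    a2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    return inter / (a1 + a2 - inter)

print(iou((0, 0, 10, 10), (5, 0, 15, 10)))       # 0.333...: 50 px overlap over a 150 px union
print(iou((0, 0, 10, 10), (20, 20, 30, 30)))     # 0.0: disjoint boxes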
layout_res[idx] - layout_res.append(merged_table) - - # Update tracking lists - table_indices = [k if k < min(to_remove) else - k - 1 if k < max(to_remove) else - k - 2 if k > max(to_remove) else - len(layout_res) - 1 - for k in table_indices - if k not in to_remove] - table_indices.append(len(layout_res) - 1) - - # Update table lists - table_res_list.pop(j) - table_res_list.pop(i) - table_res_list.append(merged_table) - - # Update table_info - table_info = [get_coords_and_area(table) for table in table_res_list] - - merged = True - break - j += 1 - - if merged: - break - i += 1 - - return table_res_list, table_indices - - -def filter_nested_tables(table_res_list, overlap_threshold=0.8, area_threshold=0.8): - """Remove big tables containing multiple smaller tables within them.""" - if len(table_res_list) < 3: - return table_res_list - - table_info = [get_coords_and_area(table) for table in table_res_list] - big_tables_idx = [] - - for i in range(len(table_res_list)): - # Find tables inside this one - tables_inside = [j for j in range(len(table_res_list)) - if i != j and is_inside(table_info[j], table_info[i], overlap_threshold)] - - # Continue if there are at least 3 tables inside - if len(tables_inside) >= 3: - # Check if inside tables overlap with each other - tables_overlap = any(do_overlap(table_info[tables_inside[idx1]], table_info[tables_inside[idx2]]) - for idx1 in range(len(tables_inside)) - for idx2 in range(idx1 + 1, len(tables_inside))) - - # If no overlaps, check area condition - if not tables_overlap: - total_inside_area = sum(table_info[j][4] for j in tables_inside) - big_table_area = table_info[i][4] - - if total_inside_area > area_threshold * big_table_area: - big_tables_idx.append(i) - - return [table for i, table in enumerate(table_res_list) if i not in big_tables_idx] - - -def remove_overlaps_min_blocks(res_list): - # 重叠block,小的不能直接删除,需要和大的那个合并成一个更大的。 - # 删除重叠blocks中较小的那些 - need_remove = [] - for res1 in res_list: - for res2 in res_list: - if res1 != res2: - overlap_box = get_minbox_if_overlap_by_ratio( - res1['bbox'], res2['bbox'], 0.8 - ) - if overlap_box is not None: - res_to_remove = next( - (res for res in res_list if res['bbox'] == overlap_box), - None, - ) - if ( - res_to_remove is not None - and res_to_remove not in need_remove - ): - large_res = res1 if res1 != res_to_remove else res2 - x1, y1, x2, y2 = large_res['bbox'] - sx1, sy1, sx2, sy2 = res_to_remove['bbox'] - x1 = min(x1, sx1) - y1 = min(y1, sy1) - x2 = max(x2, sx2) - y2 = max(y2, sy2) - large_res['bbox'] = [x1, y1, x2, y2] - need_remove.append(res_to_remove) - - if len(need_remove) > 0: - for res in need_remove: - res_list.remove(res) - - return res_list, need_remove - - -def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshold=0.8, area_threshold=0.8): - """Extract OCR, table and other regions from layout results.""" - ocr_res_list = [] - text_res_list = [] - table_res_list = [] - table_indices = [] - single_page_mfdetrec_res = [] - - # Categorize regions - for i, res in enumerate(layout_res): - category_id = int(res['category_id']) - - if category_id in [13, 14]: # Formula regions - single_page_mfdetrec_res.append({ - "bbox": [int(res['poly'][0]), int(res['poly'][1]), - int(res['poly'][4]), int(res['poly'][5])], - }) - elif category_id in [0, 2, 4, 6, 7, 3]: # OCR regions - ocr_res_list.append(res) - elif category_id == 5: # Table regions - table_res_list.append(res) - table_indices.append(i) - elif category_id in [1]: # Text regions - res['bbox'] = [int(res['poly'][0]), 
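remove_overlaps_min_blocks above resolves heavy overlaps by growing the larger block to the union of the two bboxes and dropping the smaller one. A tiny numeric illustration of the union step:

def union_bbox(large, small):
    # The big block absorbs the small one, as in the expansion above.
    x1, y1, x2, y2 = large
    sx1, sy1, sx2, sy2 = small
    return [min(x1, sx1), min(y1, sy1), max(x2, sx2), max(y2, sy2)]

large = [100, 100, 400, 300]
small = [380, 120, 450, 200]        # overlaps the right edge of the large block
print(union_bbox(large, small))     # [100, 100, 450, 300] -> one merged block; the small one is removed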
int(res['poly'][1]), int(res['poly'][4]), int(res['poly'][5])] - text_res_list.append(res) - - # Process tables: merge high IoU tables first, then filter nested tables - table_res_list, table_indices = merge_high_iou_tables( - table_res_list, layout_res, table_indices, iou_threshold) - - filtered_table_res_list = filter_nested_tables( - table_res_list, overlap_threshold, area_threshold) - - # Remove filtered out tables from layout_res - if len(filtered_table_res_list) < len(table_res_list): - kept_tables = set(id(table) for table in filtered_table_res_list) - to_remove = [table_indices[i] for i, table in enumerate(table_res_list) - if id(table) not in kept_tables] - - for idx in sorted(to_remove, reverse=True): - del layout_res[idx] - - # Remove overlaps in OCR and text regions - text_res_list, need_remove = remove_overlaps_min_blocks(text_res_list) - for res in text_res_list: - # 将res的poly使用bbox重构 - res['poly'] = [res['bbox'][0], res['bbox'][1], res['bbox'][2], res['bbox'][1], - res['bbox'][2], res['bbox'][3], res['bbox'][0], res['bbox'][3]] - # 删除res的bbox - del res['bbox'] - - ocr_res_list.extend(text_res_list) - - if len(need_remove) > 0: - for res in need_remove: - del res['bbox'] - layout_res.remove(res) - - return ocr_res_list, filtered_table_res_list, single_page_mfdetrec_res - - -def clean_vram(device, vram_threshold=8): - total_memory = get_vram(device) - if total_memory and total_memory <= vram_threshold: - gc_start = time.time() - clean_memory(device) - gc_time = round(time.time() - gc_start, 2) - logger.info(f"gc time: {gc_time}") - - -def get_vram(device): - if torch.cuda.is_available() and str(device).startswith("cuda"): - total_memory = torch.cuda.get_device_properties(device).total_memory / (1024 ** 3) # 将字节转换为 GB - return total_memory - elif str(device).startswith("npu"): - import torch_npu - if torch_npu.npu.is_available(): - total_memory = torch_npu.npu.get_device_properties(device).total_memory / (1024 ** 3) # 转为 GB - return total_memory - else: - return None \ No newline at end of file diff --git a/magic_pdf/model/sub_modules/ocr/__init__.py b/magic_pdf/model/sub_modules/ocr/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/ocr_utils.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/ocr_utils.py deleted file mode 100644 index 70989fdf3958646b5778e4000d3750220a1d5c7a..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/ocr_utils.py +++ /dev/null @@ -1,368 +0,0 @@ -# Copyright (c) Opendatalab. All rights reserved. 
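Both this file and the OCR utilities that follow share the same conventions: a 'poly' is eight numbers, the four corners in top-left, top-right, bottom-right, bottom-left order, and a 'bbox' is the axis-aligned [x0, y0, x1, y1]. A small sketch of the round trip used when text blocks are rebuilt above:

def bbox_to_poly(bbox):
    x0, y0, x1, y1 = bbox
    return [x0, y0, x1, y0, x1, y1, x0, y1]       # corner order assumed by get_coords_and_area

def poly_to_bbox(poly):
    return [poly[0], poly[1], poly[4], poly[5]]   # indices 0,1 (top-left) and 4,5 (bottom-right)

bbox = [12, 34, 56, 78]
assert poly_to_bbox(bbox_to_poly(bbox)) == bbox   # lossless for axis-aligned boxes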
-import copy - -import cv2 -import numpy as np -from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line -from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold - - -def img_decode(content: bytes): - np_arr = np.frombuffer(content, dtype=np.uint8) - return cv2.imdecode(np_arr, cv2.IMREAD_UNCHANGED) - -def check_img(img): - if isinstance(img, bytes): - img = img_decode(img) - if isinstance(img, np.ndarray) and len(img.shape) == 2: - img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) - return img - - -def alpha_to_color(img, alpha_color=(255, 255, 255)): - if len(img.shape) == 3 and img.shape[2] == 4: - B, G, R, A = cv2.split(img) - alpha = A / 255 - - R = (alpha_color[0] * (1 - alpha) + R * alpha).astype(np.uint8) - G = (alpha_color[1] * (1 - alpha) + G * alpha).astype(np.uint8) - B = (alpha_color[2] * (1 - alpha) + B * alpha).astype(np.uint8) - - img = cv2.merge((B, G, R)) - return img - - -def preprocess_image(_image): - alpha_color = (255, 255, 255) - _image = alpha_to_color(_image, alpha_color) - return _image - - -def sorted_boxes(dt_boxes): - """ - Sort text boxes in order from top to bottom, left to right - args: - dt_boxes(array):detected text boxes with shape [4, 2] - return: - sorted boxes(array) with shape [4, 2] - """ - num_boxes = dt_boxes.shape[0] - sorted_boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0])) - _boxes = list(sorted_boxes) - - for i in range(num_boxes - 1): - for j in range(i, -1, -1): - if abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10 and \ - (_boxes[j + 1][0][0] < _boxes[j][0][0]): - tmp = _boxes[j] - _boxes[j] = _boxes[j + 1] - _boxes[j + 1] = tmp - else: - break - return _boxes - - -def bbox_to_points(bbox): - """ 将bbox格式转换为四个顶点的数组 """ - x0, y0, x1, y1 = bbox - return np.array([[x0, y0], [x1, y0], [x1, y1], [x0, y1]]).astype('float32') - - -def points_to_bbox(points): - """ 将四个顶点的数组转换为bbox格式 """ - x0, y0 = points[0] - x1, _ = points[1] - _, y1 = points[2] - return [x0, y0, x1, y1] - - -def merge_intervals(intervals): - # Sort the intervals based on the start value - intervals.sort(key=lambda x: x[0]) - - merged = [] - for interval in intervals: - # If the list of merged intervals is empty or if the current - # interval does not overlap with the previous, simply append it. - if not merged or merged[-1][1] < interval[0]: - merged.append(interval) - else: - # Otherwise, there is overlap, so we merge the current and previous intervals. 
- merged[-1][1] = max(merged[-1][1], interval[1]) - - return merged - - -def remove_intervals(original, masks): - # Merge all mask intervals - merged_masks = merge_intervals(masks) - - result = [] - original_start, original_end = original - - for mask in merged_masks: - mask_start, mask_end = mask - - # If the mask starts after the original range, ignore it - if mask_start > original_end: - continue - - # If the mask ends before the original range starts, ignore it - if mask_end < original_start: - continue - - # Remove the masked part from the original range - if original_start < mask_start: - result.append([original_start, mask_start - 1]) - - original_start = max(mask_end + 1, original_start) - - # Add the remaining part of the original range, if any - if original_start <= original_end: - result.append([original_start, original_end]) - - return result - - -def update_det_boxes(dt_boxes, mfd_res): - new_dt_boxes = [] - angle_boxes_list = [] - for text_box in dt_boxes: - - if calculate_is_angle(text_box): - angle_boxes_list.append(text_box) - continue - - text_bbox = points_to_bbox(text_box) - masks_list = [] - for mf_box in mfd_res: - mf_bbox = mf_box['bbox'] - if __is_overlaps_y_exceeds_threshold(text_bbox, mf_bbox): - masks_list.append([mf_bbox[0], mf_bbox[2]]) - text_x_range = [text_bbox[0], text_bbox[2]] - text_remove_mask_range = remove_intervals(text_x_range, masks_list) - temp_dt_box = [] - for text_remove_mask in text_remove_mask_range: - temp_dt_box.append(bbox_to_points([text_remove_mask[0], text_bbox[1], text_remove_mask[1], text_bbox[3]])) - if len(temp_dt_box) > 0: - new_dt_boxes.extend(temp_dt_box) - - new_dt_boxes.extend(angle_boxes_list) - - return new_dt_boxes - - -def merge_overlapping_spans(spans): - """ - Merges overlapping spans on the same line. - - :param spans: A list of span coordinates [(x1, y1, x2, y2), ...] - :return: A list of merged spans - """ - # Return an empty list if the input spans list is empty - if not spans: - return [] - - # Sort spans by their starting x-coordinate - spans.sort(key=lambda x: x[0]) - - # Initialize the list of merged spans - merged = [] - for span in spans: - # Unpack span coordinates - x1, y1, x2, y2 = span - # If the merged list is empty or there's no horizontal overlap, add the span directly - if not merged or merged[-1][2] < x1: - merged.append(span) - else: - # If there is horizontal overlap, merge the current span with the previous one - last_span = merged.pop() - # Update the merged span's top-left corner to the smaller (x1, y1) and bottom-right to the larger (x2, y2) - x1 = min(last_span[0], x1) - y1 = min(last_span[1], y1) - x2 = max(last_span[2], x2) - y2 = max(last_span[3], y2) - # Add the merged span back to the list - merged.append((x1, y1, x2, y2)) - - # Return the list of merged spans - return merged - - -def merge_det_boxes(dt_boxes): - """ - Merge detection boxes. - - This function takes a list of detected bounding boxes, each represented by four corner points. - The goal is to merge these bounding boxes into larger text regions. - - Parameters: - dt_boxes (list): A list containing multiple text detection boxes, where each box is defined by four corner points. - - Returns: - list: A list containing the merged text regions, where each region is represented by four corner points. 
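update_det_boxes above carves inline-formula regions out of a text-detection box by treating everything as x-intervals: the formula intervals are merged, then subtracted from the text span. A numeric sketch of that subtraction:

def merge(intervals):
    intervals = sorted(intervals, key=lambda x: x[0])
    out = []
    for s, e in intervals:
        if out and out[-1][1] >= s:
            out[-1][1] = max(out[-1][1], e)       # overlapping masks are fused
        else:
            out.append([s, e])
    return out

def subtract(original, masks):
    start, end = original
    result = []
    for ms, me in merge(masks):
        if me < start or ms > end:                # mask outside the text span
            continue
        if start < ms:
            result.append([start, ms - 1])
        start = max(me + 1, start)
    if start <= end:
        result.append([start, end])
    return result

# A text line spanning x = 100..500 with two inline formulas at 180..220 and 300..360:
print(subtract([100, 500], [[180, 220], [300, 360]]))
# [[100, 179], [221, 299], [361, 500]] -> the text box is split into three pieces around the formulas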
- """ - # Convert the detection boxes into a dictionary format with bounding boxes and type - dt_boxes_dict_list = [] - angle_boxes_list = [] - for text_box in dt_boxes: - text_bbox = points_to_bbox(text_box) - - if calculate_is_angle(text_box): - angle_boxes_list.append(text_box) - continue - - text_box_dict = { - 'bbox': text_bbox, - 'type': 'text', - } - dt_boxes_dict_list.append(text_box_dict) - - # Merge adjacent text regions into lines - lines = merge_spans_to_line(dt_boxes_dict_list) - - # Initialize a new list for storing the merged text regions - new_dt_boxes = [] - for line in lines: - line_bbox_list = [] - for span in line: - line_bbox_list.append(span['bbox']) - - # Merge overlapping text regions within the same line - merged_spans = merge_overlapping_spans(line_bbox_list) - - # Convert the merged text regions back to point format and add them to the new detection box list - for span in merged_spans: - new_dt_boxes.append(bbox_to_points(span)) - - new_dt_boxes.extend(angle_boxes_list) - - return new_dt_boxes - - -def get_adjusted_mfdetrec_res(single_page_mfdetrec_res, useful_list): - paste_x, paste_y, xmin, ymin, xmax, ymax, new_width, new_height = useful_list - # Adjust the coordinates of the formula area - adjusted_mfdetrec_res = [] - for mf_res in single_page_mfdetrec_res: - mf_xmin, mf_ymin, mf_xmax, mf_ymax = mf_res["bbox"] - # Adjust the coordinates of the formula area to the coordinates relative to the cropping area - x0 = mf_xmin - xmin + paste_x - y0 = mf_ymin - ymin + paste_y - x1 = mf_xmax - xmin + paste_x - y1 = mf_ymax - ymin + paste_y - # Filter formula blocks outside the graph - if any([x1 < 0, y1 < 0]) or any([x0 > new_width, y0 > new_height]): - continue - else: - adjusted_mfdetrec_res.append({ - "bbox": [x0, y0, x1, y1], - }) - return adjusted_mfdetrec_res - - -def get_ocr_result_list(ocr_res, useful_list, ocr_enable, new_image, lang): - paste_x, paste_y, xmin, ymin, xmax, ymax, new_width, new_height = useful_list - ocr_result_list = [] - ori_im = new_image.copy() - for box_ocr_res in ocr_res: - - if len(box_ocr_res) == 2: - p1, p2, p3, p4 = box_ocr_res[0] - text, score = box_ocr_res[1] - # logger.info(f"text: {text}, score: {score}") - if score < 0.6: # 过滤低置信度的结果 - continue - else: - p1, p2, p3, p4 = box_ocr_res - text, score = "", 1 - - if ocr_enable: - tmp_box = copy.deepcopy(np.array([p1, p2, p3, p4]).astype('float32')) - img_crop = get_rotate_crop_image(ori_im, tmp_box) - - # average_angle_degrees = calculate_angle_degrees(box_ocr_res[0]) - # if average_angle_degrees > 0.5: - poly = [p1, p2, p3, p4] - if calculate_is_angle(poly): - # logger.info(f"average_angle_degrees: {average_angle_degrees}, text: {text}") - # 与x轴的夹角超过0.5度,对边界做一下矫正 - # 计算几何中心 - x_center = sum(point[0] for point in poly) / 4 - y_center = sum(point[1] for point in poly) / 4 - new_height = ((p4[1] - p1[1]) + (p3[1] - p2[1])) / 2 - new_width = p3[0] - p1[0] - p1 = [x_center - new_width / 2, y_center - new_height / 2] - p2 = [x_center + new_width / 2, y_center - new_height / 2] - p3 = [x_center + new_width / 2, y_center + new_height / 2] - p4 = [x_center - new_width / 2, y_center + new_height / 2] - - # Convert the coordinates back to the original coordinate system - p1 = [p1[0] - paste_x + xmin, p1[1] - paste_y + ymin] - p2 = [p2[0] - paste_x + xmin, p2[1] - paste_y + ymin] - p3 = [p3[0] - paste_x + xmin, p3[1] - paste_y + ymin] - p4 = [p4[0] - paste_x + xmin, p4[1] - paste_y + ymin] - - if ocr_enable: - ocr_result_list.append({ - 'category_id': 15, - 'poly': p1 + p2 + p3 + p4, - 'score': 
1, - 'text': text, - 'np_img': img_crop, - 'lang': lang, - }) - else: - ocr_result_list.append({ - 'category_id': 15, - 'poly': p1 + p2 + p3 + p4, - 'score': float(round(score, 2)), - 'text': text, - }) - - return ocr_result_list - - -def calculate_is_angle(poly): - p1, p2, p3, p4 = poly - height = ((p4[1] - p1[1]) + (p3[1] - p2[1])) / 2 - if 0.8 * height <= (p3[1] - p1[1]) <= 1.2 * height: - return False - else: - # logger.info((p3[1] - p1[1])/height) - return True - - -def get_rotate_crop_image(img, points): - ''' - img_height, img_width = img.shape[0:2] - left = int(np.min(points[:, 0])) - right = int(np.max(points[:, 0])) - top = int(np.min(points[:, 1])) - bottom = int(np.max(points[:, 1])) - img_crop = img[top:bottom, left:right, :].copy() - points[:, 0] = points[:, 0] - left - points[:, 1] = points[:, 1] - top - ''' - assert len(points) == 4, "shape of points must be 4*2" - img_crop_width = int( - max( - np.linalg.norm(points[0] - points[1]), - np.linalg.norm(points[2] - points[3]))) - img_crop_height = int( - max( - np.linalg.norm(points[0] - points[3]), - np.linalg.norm(points[1] - points[2]))) - pts_std = np.float32([[0, 0], [img_crop_width, 0], - [img_crop_width, img_crop_height], - [0, img_crop_height]]) - M = cv2.getPerspectiveTransform(points, pts_std) - dst_img = cv2.warpPerspective( - img, - M, (img_crop_width, img_crop_height), - borderMode=cv2.BORDER_REPLICATE, - flags=cv2.INTER_CUBIC) - dst_img_height, dst_img_width = dst_img.shape[0:2] - if dst_img_height * 1.0 / dst_img_width >= 1.5: - dst_img = np.rot90(dst_img) - return dst_img \ No newline at end of file diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorch_paddle.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorch_paddle.py deleted file mode 100644 index 448bfda9287fb2ac63ba0ef92b6552fe21bb7680..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorch_paddle.py +++ /dev/null @@ -1,199 +0,0 @@ -# Copyright (c) Opendatalab. All rights reserved. 
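Before moving on to the torch OCR pipeline below, here is a minimal, self-contained sketch of the perspective warp that `get_rotate_crop_image` above performs on each detected quadrilateral before recognition. The image and box here are synthetic and the snippet only mirrors the math of the function; it is not part of the original module.

import cv2
import numpy as np

# Synthetic page with one slightly rotated text box (clockwise quad: tl, tr, br, bl).
page = np.full((200, 300, 3), 255, dtype=np.uint8)
box = np.float32([[60, 80], [220, 95], [216, 135], [56, 120]])
cv2.polylines(page, [box.astype(np.int32)], True, (0, 0, 0), 2)

# Same idea as get_rotate_crop_image: target size from the quad's edge lengths,
# then a perspective warp so the text line ends up axis-aligned.
w = int(max(np.linalg.norm(box[0] - box[1]), np.linalg.norm(box[2] - box[3])))
h = int(max(np.linalg.norm(box[0] - box[3]), np.linalg.norm(box[1] - box[2])))
dst = np.float32([[0, 0], [w, 0], [w, h], [0, h]])
M = cv2.getPerspectiveTransform(box, dst)
crop = cv2.warpPerspective(page, M, (w, h),
                           borderMode=cv2.BORDER_REPLICATE, flags=cv2.INTER_CUBIC)
if h / w >= 1.5:  # tall crops are treated as vertical text and rotated upright
    crop = np.rot90(crop)
print(crop.shape)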
-import copy -import os.path -import warnings -from pathlib import Path - -import cv2 -import numpy as np -import yaml -from loguru import logger - -from magic_pdf.libs.config_reader import get_device, get_local_models_dir -from .ocr_utils import check_img, preprocess_image, sorted_boxes, merge_det_boxes, update_det_boxes, get_rotate_crop_image -from .tools.infer.predict_system import TextSystem -from .tools.infer import pytorchocr_utility as utility -import argparse - - -latin_lang = [ - 'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr', # noqa: E126 - 'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl', - 'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv', - 'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german' -] -arabic_lang = ['ar', 'fa', 'ug', 'ur'] -cyrillic_lang = [ - 'ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava', # noqa: E126 - 'dar', 'inh', 'che', 'lbe', 'lez', 'tab' -] -devanagari_lang = [ - 'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom', # noqa: E126 - 'sa', 'bgc' -] - - -def get_model_params(lang, config): - if lang in config['lang']: - params = config['lang'][lang] - det = params.get('det') - rec = params.get('rec') - dict_file = params.get('dict') - return det, rec, dict_file - else: - raise Exception (f'Language {lang} not supported') - - -root_dir = Path(__file__).resolve().parent - - -class PytorchPaddleOCR(TextSystem): - def __init__(self, *args, **kwargs): - parser = utility.init_args() - args = parser.parse_args(args) - - self.lang = kwargs.get('lang', 'ch') - - device = get_device() - if device == 'cpu' and self.lang in ['ch', 'ch_server']: - logger.warning("The current device in use is CPU. To ensure the speed of parsing, the language is automatically switched to ch_lite.") - self.lang = 'ch_lite' - - if self.lang in latin_lang: - self.lang = 'latin' - elif self.lang in arabic_lang: - self.lang = 'arabic' - elif self.lang in cyrillic_lang: - self.lang = 'cyrillic' - elif self.lang in devanagari_lang: - self.lang = 'devanagari' - else: - pass - - models_config_path = os.path.join(root_dir, 'pytorchocr', 'utils', 'resources', 'models_config.yml') - with open(models_config_path) as file: - config = yaml.safe_load(file) - det, rec, dict_file = get_model_params(self.lang, config) - ocr_models_dir = os.path.join(get_local_models_dir(), 'OCR', 'paddleocr_torch') - kwargs['det_model_path'] = os.path.join(ocr_models_dir, det) - kwargs['rec_model_path'] = os.path.join(ocr_models_dir, rec) - kwargs['rec_char_dict_path'] = os.path.join(root_dir, 'pytorchocr', 'utils', 'resources', 'dict', dict_file) - # kwargs['rec_batch_num'] = 8 - - kwargs['device'] = device - - default_args = vars(args) - default_args.update(kwargs) - args = argparse.Namespace(**default_args) - - super().__init__(args) - - def ocr(self, - img, - det=True, - rec=True, - mfd_res=None, - tqdm_enable=False, - ): - assert isinstance(img, (np.ndarray, list, str, bytes)) - if isinstance(img, list) and det == True: - logger.error('When input a list of images, det must be false') - exit(0) - img = check_img(img) - imgs = [img] - with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=RuntimeWarning) - if det and rec: - ocr_res = [] - for img in imgs: - img = preprocess_image(img) - dt_boxes, rec_res = self.__call__(img, mfd_res=mfd_res) - if not dt_boxes and not rec_res: - ocr_res.append(None) - continue - tmp_res = [[box.tolist(), res] for box, res in zip(dt_boxes, rec_res)] - 
ocr_res.append(tmp_res)
-                return ocr_res
-            elif det and not rec:
-                ocr_res = []
-                for img in imgs:
-                    img = preprocess_image(img)
-                    dt_boxes, elapse = self.text_detector(img)
-                    # logger.debug("dt_boxes num : {}, elapsed : {}".format(len(dt_boxes), elapse))
-                    if dt_boxes is None:
-                        ocr_res.append(None)
-                        continue
-                    dt_boxes = sorted_boxes(dt_boxes)
-                    # merge_det_boxes and update_det_boxes both convert each poly to a bbox and back, so heavily tilted text boxes have to be filtered out first
-                    dt_boxes = merge_det_boxes(dt_boxes)
-                    if mfd_res:
-                        dt_boxes = update_det_boxes(dt_boxes, mfd_res)
-                    tmp_res = [box.tolist() for box in dt_boxes]
-                    ocr_res.append(tmp_res)
-                return ocr_res
-            elif not det and rec:
-                ocr_res = []
-                for img in imgs:
-                    if not isinstance(img, list):
-                        img = preprocess_image(img)
-                        img = [img]
-                    rec_res, elapse = self.text_recognizer(img, tqdm_enable=tqdm_enable)
-                    # logger.debug("rec_res num : {}, elapsed : {}".format(len(rec_res), elapse))
-                    ocr_res.append(rec_res)
-                return ocr_res
-
-    def __call__(self, img, mfd_res=None):
-
-        if img is None:
-            logger.debug("no valid image provided")
-            return None, None
-
-        ori_im = img.copy()
-        dt_boxes, elapse = self.text_detector(img)
-
-        if dt_boxes is None:
-            logger.debug("no dt_boxes found, elapsed : {}".format(elapse))
-            return None, None
-        else:
-            pass
-            # logger.debug("dt_boxes num : {}, elapsed : {}".format(len(dt_boxes), elapse))
-        img_crop_list = []
-
-        dt_boxes = sorted_boxes(dt_boxes)
-
-        # merge_det_boxes and update_det_boxes both convert each poly to a bbox and back, so heavily tilted text boxes have to be filtered out first
-        dt_boxes = merge_det_boxes(dt_boxes)
-
-        if mfd_res:
-            dt_boxes = update_det_boxes(dt_boxes, mfd_res)
-
-        for bno in range(len(dt_boxes)):
-            tmp_box = copy.deepcopy(dt_boxes[bno])
-            img_crop = get_rotate_crop_image(ori_im, tmp_box)
-            img_crop_list.append(img_crop)
-
-        rec_res, elapse = self.text_recognizer(img_crop_list)
-        # logger.debug("rec_res num : {}, elapsed : {}".format(len(rec_res), elapse))
-
-        filter_boxes, filter_rec_res = [], []
-        for box, rec_result in zip(dt_boxes, rec_res):
-            text, score = rec_result
-            if score >= self.drop_score:
-                filter_boxes.append(box)
-                filter_rec_res.append(rec_result)
-
-        return filter_boxes, filter_rec_res
-
-if __name__ == '__main__':
-    pytorch_paddle_ocr = PytorchPaddleOCR()
-    img = cv2.imread("/Users/myhloli/Downloads/screenshot-20250326-194348.png")
-    dt_boxes, rec_res = pytorch_paddle_ocr(img)
-    ocr_res = []
-    if not dt_boxes and not rec_res:
-        ocr_res.append(None)
-    else:
-        tmp_res = [[box.tolist(), res] for box, res in zip(dt_boxes, rec_res)]
-        ocr_res.append(tmp_res)
-    print(ocr_res)
-
-
diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/__init__.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/__init__.py
deleted file mode 100755
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/base_ocr_v20.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/base_ocr_v20.py
deleted file mode 100755
index c169d20db9d3b3ea799e1c304ce8684cd8f12362..0000000000000000000000000000000000000000
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/base_ocr_v20.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import os
-import torch
-from .modeling.architectures.base_model import BaseModel
-
-class BaseOCRV20:
-    def __init__(self, config, **kwargs):
-        self.config = config
-        self.build_net(**kwargs)
-        self.net.eval()
-
-
-    def build_net(self, **kwargs):
-        self.net = BaseModel(self.config, **kwargs)
-
-    def read_pytorch_weights(self, weights_path):
-        if not os.path.exists(weights_path):
-            raise FileNotFoundError('{} does not exist.'.format(weights_path))
-        weights = torch.load(weights_path)
-        return weights
-
-    def get_out_channels(self, weights):
-        if list(weights.keys())[-1].endswith('.weight') and len(list(weights.values())[-1].shape) == 2:
-            out_channels = list(weights.values())[-1].numpy().shape[1]
-        else:
-            out_channels = list(weights.values())[-1].numpy().shape[0]
-        return out_channels
-
-    def load_state_dict(self, weights):
-        self.net.load_state_dict(weights)
-        # print('weights is loaded.')
-
-    def load_pytorch_weights(self, weights_path):
-        self.net.load_state_dict(torch.load(weights_path, weights_only=True))
-        # print('model is loaded: {}'.format(weights_path))
-
-    def inference(self, inputs):
-        with torch.no_grad():
-            infer = self.net(inputs)
-        return infer
diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/__init__.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/__init__.py
deleted file mode 100755
index 9eef2969a0854c6fc295c3696ba153d300e7c2f1..0000000000000000000000000000000000000000
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/__init__.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
-
-from .imaug import transform, create_operators
-
-
diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/imaug/__init__.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/imaug/__init__.py
deleted file mode 100755
index 13abd6741c581fcb6d042854404f65c49213e9d9..0000000000000000000000000000000000000000
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/imaug/__init__.py
+++ /dev/null
@@ -1,48 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
-
-# from .iaa_augment import IaaAugment
-# from .make_border_map import MakeBorderMap
-# from .make_shrink_map import MakeShrinkMap
-# from .random_crop_data import EastRandomCropData, PSERandomCrop
-
-# from .rec_img_aug import RecAug, RecResizeImg, ClsResizeImg
-# from .randaugment import RandAugment
-from .operators import *
-# from .label_ops import *
-
-# from .east_process import *
-# from .sast_process import *
-# from .gen_table_mask import *
-
-def transform(data, ops=None):
-    """ transform """
-    if ops is None:
-        ops = []
-    for op in ops:
-        data = op(data)
-        if data is None:
-            return None
-    return data
-
-
-def create_operators(op_param_list, global_config=None):
-    """
-    create operators based on the config
-    Args:
-        params(list): a dict list, used to create some operators
-    """
-    assert isinstance(op_param_list, list), ('operator config should be a list')
-    ops = []
-    for operator in op_param_list:
-        assert isinstance(operator, dict) and len(operator) == 1, "yaml format error"
-        op_name = list(operator)[0]
-        param = {} if operator[op_name] is None else operator[op_name]
-        if global_config is not None:
-            param.update(global_config)
-        op = eval(op_name)(**param)
-        ops.append(op)
-    return ops
\ No newline at end of file
diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/imaug/operators.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/imaug/operators.py
deleted file mode 100755 index 
daa67a25dae93dde74fc0b92aad4aa6ef4d4c003..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/imaug/operators.py +++ /dev/null @@ -1,418 +0,0 @@ -""" -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import sys -import six -import cv2 -import numpy as np - - -class DecodeImage(object): - """ decode image """ - - def __init__(self, img_mode='RGB', channel_first=False, **kwargs): - self.img_mode = img_mode - self.channel_first = channel_first - - def __call__(self, data): - img = data['image'] - if six.PY2: - assert type(img) is str and len( - img) > 0, "invalid input 'img' in DecodeImage" - else: - assert type(img) is bytes and len( - img) > 0, "invalid input 'img' in DecodeImage" - img = np.frombuffer(img, dtype='uint8') - img = cv2.imdecode(img, 1) - if img is None: - return None - if self.img_mode == 'GRAY': - img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) - elif self.img_mode == 'RGB': - assert img.shape[2] == 3, 'invalid shape of image[%s]' % (img.shape) - img = img[:, :, ::-1] - - if self.channel_first: - img = img.transpose((2, 0, 1)) - - data['image'] = img - return data - - -class NRTRDecodeImage(object): - """ decode image """ - - def __init__(self, img_mode='RGB', channel_first=False, **kwargs): - self.img_mode = img_mode - self.channel_first = channel_first - - def __call__(self, data): - img = data['image'] - if six.PY2: - assert type(img) is str and len( - img) > 0, "invalid input 'img' in DecodeImage" - else: - assert type(img) is bytes and len( - img) > 0, "invalid input 'img' in DecodeImage" - img = np.frombuffer(img, dtype='uint8') - - img = cv2.imdecode(img, 1) - - if img is None: - return None - if self.img_mode == 'GRAY': - img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) - elif self.img_mode == 'RGB': - assert img.shape[2] == 3, 'invalid shape of image[%s]' % (img.shape) - img = img[:, :, ::-1] - img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - if self.channel_first: - img = img.transpose((2, 0, 1)) - data['image'] = img - return data - - -class NormalizeImage(object): - """ normalize image such as substract mean, divide std - """ - - def __init__(self, scale=None, mean=None, std=None, order='chw', **kwargs): - if isinstance(scale, str): - scale = eval(scale) - self.scale = np.float32(scale if scale is not None else 1.0 / 255.0) - mean = mean if mean is not None else [0.485, 0.456, 0.406] - std = std if std is not None else [0.229, 0.224, 0.225] - - shape = (3, 1, 1) if order == 'chw' else (1, 1, 3) - self.mean = np.array(mean).reshape(shape).astype('float32') - self.std = np.array(std).reshape(shape).astype('float32') - - def __call__(self, data): - img = data['image'] - from PIL import Image - if isinstance(img, Image.Image): - img = np.array(img) - assert isinstance(img, - np.ndarray), "invalid input 
'img' in NormalizeImage" - data['image'] = ( - img.astype('float32') * self.scale - self.mean) / self.std - return data - - -class ToCHWImage(object): - """ convert hwc image to chw image - """ - - def __init__(self, **kwargs): - pass - - def __call__(self, data): - img = data['image'] - from PIL import Image - if isinstance(img, Image.Image): - img = np.array(img) - data['image'] = img.transpose((2, 0, 1)) - return data - - -class Fasttext(object): - def __init__(self, path="None", **kwargs): - import fasttext - self.fast_model = fasttext.load_model(path) - - def __call__(self, data): - label = data['label'] - fast_label = self.fast_model[label] - data['fast_label'] = fast_label - return data - - -class KeepKeys(object): - def __init__(self, keep_keys, **kwargs): - self.keep_keys = keep_keys - - def __call__(self, data): - data_list = [] - for key in self.keep_keys: - data_list.append(data[key]) - return data_list - - -class Resize(object): - def __init__(self, size=(640, 640), **kwargs): - self.size = size - - def resize_image(self, img): - resize_h, resize_w = self.size - ori_h, ori_w = img.shape[:2] # (h, w, c) - ratio_h = float(resize_h) / ori_h - ratio_w = float(resize_w) / ori_w - img = cv2.resize(img, (int(resize_w), int(resize_h))) - return img, [ratio_h, ratio_w] - - def __call__(self, data): - img = data['image'] - text_polys = data['polys'] - - img_resize, [ratio_h, ratio_w] = self.resize_image(img) - new_boxes = [] - for box in text_polys: - new_box = [] - for cord in box: - new_box.append([cord[0] * ratio_w, cord[1] * ratio_h]) - new_boxes.append(new_box) - data['image'] = img_resize - data['polys'] = np.array(new_boxes, dtype=np.float32) - return data - - -class DetResizeForTest(object): - def __init__(self, **kwargs): - super(DetResizeForTest, self).__init__() - self.resize_type = 0 - if 'image_shape' in kwargs: - self.image_shape = kwargs['image_shape'] - self.resize_type = 1 - elif 'limit_side_len' in kwargs: - self.limit_side_len = kwargs['limit_side_len'] - self.limit_type = kwargs.get('limit_type', 'min') - elif 'resize_long' in kwargs: - self.resize_type = 2 - self.resize_long = kwargs.get('resize_long', 960) - else: - self.limit_side_len = 736 - self.limit_type = 'min' - - def __call__(self, data): - img = data['image'] - src_h, src_w, _ = img.shape - - if self.resize_type == 0: - # img, shape = self.resize_image_type0(img) - img, [ratio_h, ratio_w] = self.resize_image_type0(img) - elif self.resize_type == 2: - img, [ratio_h, ratio_w] = self.resize_image_type2(img) - else: - # img, shape = self.resize_image_type1(img) - img, [ratio_h, ratio_w] = self.resize_image_type1(img) - data['image'] = img - data['shape'] = np.array([src_h, src_w, ratio_h, ratio_w]) - return data - - def resize_image_type1(self, img): - resize_h, resize_w = self.image_shape - ori_h, ori_w = img.shape[:2] # (h, w, c) - ratio_h = float(resize_h) / ori_h - ratio_w = float(resize_w) / ori_w - img = cv2.resize(img, (int(resize_w), int(resize_h))) - # return img, np.array([ori_h, ori_w]) - return img, [ratio_h, ratio_w] - - def resize_image_type0(self, img): - """ - resize image to a size multiple of 32 which is required by the network - args: - img(array): array with shape [h, w, c] - return(tuple): - img, (ratio_h, ratio_w) - """ - limit_side_len = self.limit_side_len - h, w, c = img.shape - - # limit the max side - if self.limit_type == 'max': - if max(h, w) > limit_side_len: - if h > w: - ratio = float(limit_side_len) / h - else: - ratio = float(limit_side_len) / w - else: - ratio = 1. 
- elif self.limit_type == 'min': - if min(h, w) < limit_side_len: - if h < w: - ratio = float(limit_side_len) / h - else: - ratio = float(limit_side_len) / w - else: - ratio = 1. - elif self.limit_type == 'resize_long': - ratio = float(limit_side_len) / max(h, w) - else: - raise Exception('not support limit type, image ') - resize_h = int(h * ratio) - resize_w = int(w * ratio) - - resize_h = max(int(round(resize_h / 32) * 32), 32) - resize_w = max(int(round(resize_w / 32) * 32), 32) - - try: - if int(resize_w) <= 0 or int(resize_h) <= 0: - return None, (None, None) - img = cv2.resize(img, (int(resize_w), int(resize_h))) - except: - print(img.shape, resize_w, resize_h) - sys.exit(0) - ratio_h = resize_h / float(h) - ratio_w = resize_w / float(w) - return img, [ratio_h, ratio_w] - - def resize_image_type2(self, img): - h, w, _ = img.shape - - resize_w = w - resize_h = h - - if resize_h > resize_w: - ratio = float(self.resize_long) / resize_h - else: - ratio = float(self.resize_long) / resize_w - - resize_h = int(resize_h * ratio) - resize_w = int(resize_w * ratio) - - max_stride = 128 - resize_h = (resize_h + max_stride - 1) // max_stride * max_stride - resize_w = (resize_w + max_stride - 1) // max_stride * max_stride - img = cv2.resize(img, (int(resize_w), int(resize_h))) - ratio_h = resize_h / float(h) - ratio_w = resize_w / float(w) - - return img, [ratio_h, ratio_w] - - -class E2EResizeForTest(object): - def __init__(self, **kwargs): - super(E2EResizeForTest, self).__init__() - self.max_side_len = kwargs['max_side_len'] - self.valid_set = kwargs['valid_set'] - - def __call__(self, data): - img = data['image'] - src_h, src_w, _ = img.shape - if self.valid_set == 'totaltext': - im_resized, [ratio_h, ratio_w] = self.resize_image_for_totaltext( - img, max_side_len=self.max_side_len) - else: - im_resized, (ratio_h, ratio_w) = self.resize_image( - img, max_side_len=self.max_side_len) - data['image'] = im_resized - data['shape'] = np.array([src_h, src_w, ratio_h, ratio_w]) - return data - - def resize_image_for_totaltext(self, im, max_side_len=512): - - h, w, _ = im.shape - resize_w = w - resize_h = h - ratio = 1.25 - if h * ratio > max_side_len: - ratio = float(max_side_len) / resize_h - resize_h = int(resize_h * ratio) - resize_w = int(resize_w * ratio) - - max_stride = 128 - resize_h = (resize_h + max_stride - 1) // max_stride * max_stride - resize_w = (resize_w + max_stride - 1) // max_stride * max_stride - im = cv2.resize(im, (int(resize_w), int(resize_h))) - ratio_h = resize_h / float(h) - ratio_w = resize_w / float(w) - return im, (ratio_h, ratio_w) - - def resize_image(self, im, max_side_len=512): - """ - resize image to a size multiple of max_stride which is required by the network - :param im: the resized image - :param max_side_len: limit of max image size to avoid out of memory in gpu - :return: the resized image and the resize ratio - """ - h, w, _ = im.shape - - resize_w = w - resize_h = h - - # Fix the longer side - if resize_h > resize_w: - ratio = float(max_side_len) / resize_h - else: - ratio = float(max_side_len) / resize_w - - resize_h = int(resize_h * ratio) - resize_w = int(resize_w * ratio) - - max_stride = 128 - resize_h = (resize_h + max_stride - 1) // max_stride * max_stride - resize_w = (resize_w + max_stride - 1) // max_stride * max_stride - im = cv2.resize(im, (int(resize_w), int(resize_h))) - ratio_h = resize_h / float(h) - ratio_w = resize_w / float(w) - - return im, (ratio_h, ratio_w) - - -class KieResize(object): - def __init__(self, **kwargs): - 
super(KieResize, self).__init__() - self.max_side, self.min_side = kwargs['img_scale'][0], kwargs[ - 'img_scale'][1] - - def __call__(self, data): - img = data['image'] - points = data['points'] - src_h, src_w, _ = img.shape - im_resized, scale_factor, [ratio_h, ratio_w - ], [new_h, new_w] = self.resize_image(img) - resize_points = self.resize_boxes(img, points, scale_factor) - data['ori_image'] = img - data['ori_boxes'] = points - data['points'] = resize_points - data['image'] = im_resized - data['shape'] = np.array([new_h, new_w]) - return data - - def resize_image(self, img): - norm_img = np.zeros([1024, 1024, 3], dtype='float32') - scale = [512, 1024] - h, w = img.shape[:2] - max_long_edge = max(scale) - max_short_edge = min(scale) - scale_factor = min(max_long_edge / max(h, w), - max_short_edge / min(h, w)) - resize_w, resize_h = int(w * float(scale_factor) + 0.5), int(h * float( - scale_factor) + 0.5) - max_stride = 32 - resize_h = (resize_h + max_stride - 1) // max_stride * max_stride - resize_w = (resize_w + max_stride - 1) // max_stride * max_stride - im = cv2.resize(img, (resize_w, resize_h)) - new_h, new_w = im.shape[:2] - w_scale = new_w / w - h_scale = new_h / h - scale_factor = np.array( - [w_scale, h_scale, w_scale, h_scale], dtype=np.float32) - norm_img[:new_h, :new_w, :] = im - return norm_img, scale_factor, [h_scale, w_scale], [new_h, new_w] - - def resize_boxes(self, im, points, scale_factor): - points = points * scale_factor - img_shape = im.shape[:2] - points[:, 0::2] = np.clip(points[:, 0::2], 0, img_shape[1]) - points[:, 1::2] = np.clip(points[:, 1::2], 0, img_shape[0]) - return points diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/__init__.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/__init__.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/__init__.py deleted file mode 100644 index 7ad5eb47c2efb04ef0b1ecdea9e2173acdf6706d..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
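The preprocessing classes above (DecodeImage, NormalizeImage, DetResizeForTest, ToCHWImage, KeepKeys, ...) are meant to be chained through `create_operators` and `transform` from the imaug package. A rough usage sketch is shown below; it assumes the (here-deleted) package is importable and the parameter values are illustrative, not the project's shipped detection config.

import numpy as np

from magic_pdf.model.sub_modules.ocr.paddleocr2pytorch.pytorchocr.data.imaug import (
    create_operators, transform)

# Illustrative detection-style preprocessing chain built from the operators above.
ops_config = [
    {'DetResizeForTest': {'limit_side_len': 960, 'limit_type': 'max'}},
    {'NormalizeImage': {'scale': '1./255.',
                        'mean': [0.485, 0.456, 0.406],
                        'std': [0.229, 0.224, 0.225],
                        'order': 'hwc'}},
    {'ToCHWImage': None},
    {'KeepKeys': {'keep_keys': ['image', 'shape']}},
]
ops = create_operators(ops_config)                  # instantiates each operator by name
data = {'image': np.zeros((720, 1280, 3), dtype=np.uint8)}
image, shape = transform(data, ops)                 # runs the operators in order
print(image.shape)                                  # CHW float32, sides rounded to multiples of 32
print(shape)                                        # [src_h, src_w, ratio_h, ratio_w]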
- -import copy - -__all__ = ["build_model"] - - -def build_model(config, **kwargs): - from .base_model import BaseModel - - config = copy.deepcopy(config) - module_class = BaseModel(config, **kwargs) - return module_class diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/base_model.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/base_model.py deleted file mode 100644 index e7f7ce49b7201f99e050cb8d83b3eb0fb318760d..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/base_model.py +++ /dev/null @@ -1,105 +0,0 @@ -from torch import nn - -from ..backbones import build_backbone -from ..heads import build_head -from ..necks import build_neck - - -class BaseModel(nn.Module): - def __init__(self, config, **kwargs): - """ - the module for OCR. - args: - config (dict): the super parameters for module. - """ - super(BaseModel, self).__init__() - - in_channels = config.get("in_channels", 3) - model_type = config["model_type"] - # build backbone, backbone is need for del, rec and cls - if "Backbone" not in config or config["Backbone"] is None: - self.use_backbone = False - else: - self.use_backbone = True - config["Backbone"]["in_channels"] = in_channels - self.backbone = build_backbone(config["Backbone"], model_type) - in_channels = self.backbone.out_channels - - # build neck - # for rec, neck can be cnn,rnn or reshape(None) - # for det, neck can be FPN, BIFPN and so on. - # for cls, neck should be none - if "Neck" not in config or config["Neck"] is None: - self.use_neck = False - else: - self.use_neck = True - config["Neck"]["in_channels"] = in_channels - self.neck = build_neck(config["Neck"]) - in_channels = self.neck.out_channels - - # # build head, head is need for det, rec and cls - if "Head" not in config or config["Head"] is None: - self.use_head = False - else: - self.use_head = True - config["Head"]["in_channels"] = in_channels - self.head = build_head(config["Head"], **kwargs) - - self.return_all_feats = config.get("return_all_feats", False) - - self._initialize_weights() - - def _initialize_weights(self): - # weight initialization - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode="fan_out") - if m.bias is not None: - nn.init.zeros_(m.bias) - elif isinstance(m, nn.BatchNorm2d): - nn.init.ones_(m.weight) - nn.init.zeros_(m.bias) - elif isinstance(m, nn.Linear): - nn.init.normal_(m.weight, 0, 0.01) - if m.bias is not None: - nn.init.zeros_(m.bias) - elif isinstance(m, nn.ConvTranspose2d): - nn.init.kaiming_normal_(m.weight, mode="fan_out") - if m.bias is not None: - nn.init.zeros_(m.bias) - - def forward(self, x): - y = dict() - if self.use_backbone: - x = self.backbone(x) - if isinstance(x, dict): - y.update(x) - else: - y["backbone_out"] = x - final_name = "backbone_out" - if self.use_neck: - x = self.neck(x) - if isinstance(x, dict): - y.update(x) - else: - y["neck_out"] = x - final_name = "neck_out" - if self.use_head: - x = self.head(x) - # for multi head, save ctc neck out for udml - if isinstance(x, dict) and "ctc_nect" in x.keys(): - y["neck_out"] = x["ctc_neck"] - y["head_out"] = x - elif isinstance(x, dict): - y.update(x) - else: - y["head_out"] = x - if self.return_all_feats: - if self.training: - return y - elif isinstance(x, dict): - return x - else: - return {final_name: x} - else: - return x diff --git 
a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/__init__.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/__init__.py deleted file mode 100644 index 7f437a2388b1640995e0909595fcb1eaf6544dff..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/__init__.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -__all__ = ["build_backbone"] - - -def build_backbone(config, model_type): - if model_type == "det": - from .det_mobilenet_v3 import MobileNetV3 - from .rec_hgnet import PPHGNet_small - from .rec_lcnetv3 import PPLCNetV3 - - support_dict = [ - "MobileNetV3", - "ResNet", - "ResNet_vd", - "ResNet_SAST", - "PPLCNetV3", - "PPHGNet_small", - ] - elif model_type == "rec" or model_type == "cls": - from .rec_hgnet import PPHGNet_small - from .rec_lcnetv3 import PPLCNetV3 - from .rec_mobilenet_v3 import MobileNetV3 - from .rec_svtrnet import SVTRNet - from .rec_mv1_enhance import MobileNetV1Enhance - from .rec_pphgnetv2 import PPHGNetV2_B4 - support_dict = [ - "MobileNetV1Enhance", - "MobileNetV3", - "ResNet", - "ResNetFPN", - "MTB", - "ResNet31", - "SVTRNet", - "ViTSTR", - "DenseNet", - "PPLCNetV3", - "PPHGNet_small", - "PPHGNetV2_B4", - ] - else: - raise NotImplementedError - - module_name = config.pop("name") - assert module_name in support_dict, Exception( - "when model typs is {}, backbone only support {}".format( - model_type, support_dict - ) - ) - module_class = eval(module_name)(**config) - return module_class diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/det_mobilenet_v3.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/det_mobilenet_v3.py deleted file mode 100644 index 03511599a0fb6d0d18940e9cd2fef19d217ec6ea..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/det_mobilenet_v3.py +++ /dev/null @@ -1,269 +0,0 @@ -from torch import nn - -from ..common import Activation - - -def make_divisible(v, divisor=8, min_value=None): - if min_value is None: - min_value = divisor - new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - if new_v < 0.9 * v: - new_v += divisor - return new_v - - -class ConvBNLayer(nn.Module): - def __init__( - self, - in_channels, - out_channels, - kernel_size, - stride, - padding, - groups=1, - if_act=True, - act=None, - name=None, - ): - super(ConvBNLayer, self).__init__() - self.if_act = if_act - self.conv = nn.Conv2d( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding, - groups=groups, - bias=False, - ) - - self.bn = nn.BatchNorm2d( - out_channels, - ) - if self.if_act: - self.act = Activation(act_type=act, inplace=True) - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - if self.if_act: - 
x = self.act(x) - return x - - -class SEModule(nn.Module): - def __init__(self, in_channels, reduction=4, name=""): - super(SEModule, self).__init__() - self.avg_pool = nn.AdaptiveAvgPool2d(1) - self.conv1 = nn.Conv2d( - in_channels=in_channels, - out_channels=in_channels // reduction, - kernel_size=1, - stride=1, - padding=0, - bias=True, - ) - self.relu1 = Activation(act_type="relu", inplace=True) - self.conv2 = nn.Conv2d( - in_channels=in_channels // reduction, - out_channels=in_channels, - kernel_size=1, - stride=1, - padding=0, - bias=True, - ) - self.hard_sigmoid = Activation(act_type="hard_sigmoid", inplace=True) - - def forward(self, inputs): - outputs = self.avg_pool(inputs) - outputs = self.conv1(outputs) - outputs = self.relu1(outputs) - outputs = self.conv2(outputs) - outputs = self.hard_sigmoid(outputs) - outputs = inputs * outputs - return outputs - - -class ResidualUnit(nn.Module): - def __init__( - self, - in_channels, - mid_channels, - out_channels, - kernel_size, - stride, - use_se, - act=None, - name="", - ): - super(ResidualUnit, self).__init__() - self.if_shortcut = stride == 1 and in_channels == out_channels - self.if_se = use_se - - self.expand_conv = ConvBNLayer( - in_channels=in_channels, - out_channels=mid_channels, - kernel_size=1, - stride=1, - padding=0, - if_act=True, - act=act, - name=name + "_expand", - ) - self.bottleneck_conv = ConvBNLayer( - in_channels=mid_channels, - out_channels=mid_channels, - kernel_size=kernel_size, - stride=stride, - padding=int((kernel_size - 1) // 2), - groups=mid_channels, - if_act=True, - act=act, - name=name + "_depthwise", - ) - if self.if_se: - self.mid_se = SEModule(mid_channels, name=name + "_se") - self.linear_conv = ConvBNLayer( - in_channels=mid_channels, - out_channels=out_channels, - kernel_size=1, - stride=1, - padding=0, - if_act=False, - act=None, - name=name + "_linear", - ) - - def forward(self, inputs): - x = self.expand_conv(inputs) - x = self.bottleneck_conv(x) - if self.if_se: - x = self.mid_se(x) - x = self.linear_conv(x) - if self.if_shortcut: - x = inputs + x - return x - - -class MobileNetV3(nn.Module): - def __init__( - self, in_channels=3, model_name="large", scale=0.5, disable_se=False, **kwargs - ): - """ - the MobilenetV3 backbone network for detection module. 
- Args: - params(dict): the super parameters for build network - """ - super(MobileNetV3, self).__init__() - - self.disable_se = disable_se - - if model_name == "large": - cfg = [ - # k, exp, c, se, nl, s, - [3, 16, 16, False, "relu", 1], - [3, 64, 24, False, "relu", 2], - [3, 72, 24, False, "relu", 1], - [5, 72, 40, True, "relu", 2], - [5, 120, 40, True, "relu", 1], - [5, 120, 40, True, "relu", 1], - [3, 240, 80, False, "hard_swish", 2], - [3, 200, 80, False, "hard_swish", 1], - [3, 184, 80, False, "hard_swish", 1], - [3, 184, 80, False, "hard_swish", 1], - [3, 480, 112, True, "hard_swish", 1], - [3, 672, 112, True, "hard_swish", 1], - [5, 672, 160, True, "hard_swish", 2], - [5, 960, 160, True, "hard_swish", 1], - [5, 960, 160, True, "hard_swish", 1], - ] - cls_ch_squeeze = 960 - elif model_name == "small": - cfg = [ - # k, exp, c, se, nl, s, - [3, 16, 16, True, "relu", 2], - [3, 72, 24, False, "relu", 2], - [3, 88, 24, False, "relu", 1], - [5, 96, 40, True, "hard_swish", 2], - [5, 240, 40, True, "hard_swish", 1], - [5, 240, 40, True, "hard_swish", 1], - [5, 120, 48, True, "hard_swish", 1], - [5, 144, 48, True, "hard_swish", 1], - [5, 288, 96, True, "hard_swish", 2], - [5, 576, 96, True, "hard_swish", 1], - [5, 576, 96, True, "hard_swish", 1], - ] - cls_ch_squeeze = 576 - else: - raise NotImplementedError( - "mode[" + model_name + "_model] is not implemented!" - ) - - supported_scale = [0.35, 0.5, 0.75, 1.0, 1.25] - assert ( - scale in supported_scale - ), "supported scale are {} but input scale is {}".format(supported_scale, scale) - inplanes = 16 - # conv1 - self.conv = ConvBNLayer( - in_channels=in_channels, - out_channels=make_divisible(inplanes * scale), - kernel_size=3, - stride=2, - padding=1, - groups=1, - if_act=True, - act="hard_swish", - name="conv1", - ) - - self.stages = nn.ModuleList() - self.out_channels = [] - block_list = [] - i = 0 - inplanes = make_divisible(inplanes * scale) - for k, exp, c, se, nl, s in cfg: - se = se and not self.disable_se - if s == 2 and i > 2: - self.out_channels.append(inplanes) - self.stages.append(nn.Sequential(*block_list)) - block_list = [] - block_list.append( - ResidualUnit( - in_channels=inplanes, - mid_channels=make_divisible(scale * exp), - out_channels=make_divisible(scale * c), - kernel_size=k, - stride=s, - use_se=se, - act=nl, - name="conv" + str(i + 2), - ) - ) - inplanes = make_divisible(scale * c) - i += 1 - block_list.append( - ConvBNLayer( - in_channels=inplanes, - out_channels=make_divisible(scale * cls_ch_squeeze), - kernel_size=1, - stride=1, - padding=0, - groups=1, - if_act=True, - act="hard_swish", - name="conv_last", - ) - ) - self.stages.append(nn.Sequential(*block_list)) - self.out_channels.append(make_divisible(scale * cls_ch_squeeze)) - # for i, stage in enumerate(self.stages): - # self.add_sublayer(sublayer=stage, name="stage{}".format(i)) - - def forward(self, x): - x = self.conv(x) - out_list = [] - for stage in self.stages: - x = stage(x) - out_list.append(x) - return out_list diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_hgnet.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_hgnet.py deleted file mode 100644 index c1515a712a10c3c925d54d53a99c0f7e67453c9f..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_hgnet.py +++ /dev/null @@ -1,290 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - - -class ConvBNAct(nn.Module): 
- def __init__( - self, in_channels, out_channels, kernel_size, stride, groups=1, use_act=True - ): - super().__init__() - self.use_act = use_act - self.conv = nn.Conv2d( - in_channels, - out_channels, - kernel_size, - stride, - padding=(kernel_size - 1) // 2, - groups=groups, - bias=False, - ) - self.bn = nn.BatchNorm2d(out_channels) - if self.use_act: - self.act = nn.ReLU() - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - if self.use_act: - x = self.act(x) - return x - - -class ESEModule(nn.Module): - def __init__(self, channels): - super().__init__() - self.avg_pool = nn.AdaptiveAvgPool2d(1) - self.conv = nn.Conv2d( - in_channels=channels, - out_channels=channels, - kernel_size=1, - stride=1, - padding=0, - ) - self.sigmoid = nn.Sigmoid() - - def forward(self, x): - identity = x - x = self.avg_pool(x) - x = self.conv(x) - x = self.sigmoid(x) - return x * identity - - -class HG_Block(nn.Module): - def __init__( - self, - in_channels, - mid_channels, - out_channels, - layer_num, - identity=False, - ): - super().__init__() - self.identity = identity - - self.layers = nn.ModuleList() - self.layers.append( - ConvBNAct( - in_channels=in_channels, - out_channels=mid_channels, - kernel_size=3, - stride=1, - ) - ) - for _ in range(layer_num - 1): - self.layers.append( - ConvBNAct( - in_channels=mid_channels, - out_channels=mid_channels, - kernel_size=3, - stride=1, - ) - ) - - # feature aggregation - total_channels = in_channels + layer_num * mid_channels - self.aggregation_conv = ConvBNAct( - in_channels=total_channels, - out_channels=out_channels, - kernel_size=1, - stride=1, - ) - self.att = ESEModule(out_channels) - - def forward(self, x): - identity = x - output = [] - output.append(x) - for layer in self.layers: - x = layer(x) - output.append(x) - x = torch.cat(output, dim=1) - x = self.aggregation_conv(x) - x = self.att(x) - if self.identity: - x += identity - return x - - -class HG_Stage(nn.Module): - def __init__( - self, - in_channels, - mid_channels, - out_channels, - block_num, - layer_num, - downsample=True, - stride=[2, 1], - ): - super().__init__() - self.downsample = downsample - if downsample: - self.downsample = ConvBNAct( - in_channels=in_channels, - out_channels=in_channels, - kernel_size=3, - stride=stride, - groups=in_channels, - use_act=False, - ) - - blocks_list = [] - blocks_list.append( - HG_Block(in_channels, mid_channels, out_channels, layer_num, identity=False) - ) - for _ in range(block_num - 1): - blocks_list.append( - HG_Block( - out_channels, mid_channels, out_channels, layer_num, identity=True - ) - ) - self.blocks = nn.Sequential(*blocks_list) - - def forward(self, x): - if self.downsample: - x = self.downsample(x) - x = self.blocks(x) - return x - - -class PPHGNet(nn.Module): - """ - PPHGNet - Args: - stem_channels: list. Stem channel list of PPHGNet. - stage_config: dict. The configuration of each stage of PPHGNet. such as the number of channels, stride, etc. - layer_num: int. Number of layers of HG_Block. - use_last_conv: boolean. Whether to use a 1x1 convolutional layer before the classification layer. - class_expand: int=2048. Number of channels for the last 1x1 convolutional layer. - dropout_prob: float. Parameters of dropout, 0.0 means dropout is not used. - class_num: int=1000. The number of classes. - Returns: - model: nn.Layer. Specific PPHGNet model depends on args. 
- """ - - def __init__( - self, - stem_channels, - stage_config, - layer_num, - in_channels=3, - det=False, - out_indices=None, - ): - super().__init__() - self.det = det - self.out_indices = out_indices if out_indices is not None else [0, 1, 2, 3] - - # stem - stem_channels.insert(0, in_channels) - self.stem = nn.Sequential( - *[ - ConvBNAct( - in_channels=stem_channels[i], - out_channels=stem_channels[i + 1], - kernel_size=3, - stride=2 if i == 0 else 1, - ) - for i in range(len(stem_channels) - 1) - ] - ) - - if self.det: - self.pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) - # stages - self.stages = nn.ModuleList() - self.out_channels = [] - for block_id, k in enumerate(stage_config): - ( - in_channels, - mid_channels, - out_channels, - block_num, - downsample, - stride, - ) = stage_config[k] - self.stages.append( - HG_Stage( - in_channels, - mid_channels, - out_channels, - block_num, - layer_num, - downsample, - stride, - ) - ) - if block_id in self.out_indices: - self.out_channels.append(out_channels) - - if not self.det: - self.out_channels = stage_config["stage4"][2] - - self._init_weights() - - def _init_weights(self): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight) - elif isinstance(m, nn.BatchNorm2d): - nn.init.ones_(m.weight) - nn.init.zeros_(m.bias) - elif isinstance(m, nn.Linear): - nn.init.zeros_(m.bias) - - def forward(self, x): - x = self.stem(x) - if self.det: - x = self.pool(x) - - out = [] - for i, stage in enumerate(self.stages): - x = stage(x) - if self.det and i in self.out_indices: - out.append(x) - if self.det: - return out - - if self.training: - x = F.adaptive_avg_pool2d(x, [1, 40]) - else: - x = F.avg_pool2d(x, [3, 2]) - return x - - -def PPHGNet_small(pretrained=False, use_ssld=False, det=False, **kwargs): - """ - PPHGNet_small - Args: - pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. - If str, means the path of the pretrained model. - use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. - Returns: - model: nn.Layer. Specific `PPHGNet_small` model depends on args. - """ - stage_config_det = { - # in_channels, mid_channels, out_channels, blocks, downsample - "stage1": [128, 128, 256, 1, False, 2], - "stage2": [256, 160, 512, 1, True, 2], - "stage3": [512, 192, 768, 2, True, 2], - "stage4": [768, 224, 1024, 1, True, 2], - } - - stage_config_rec = { - # in_channels, mid_channels, out_channels, blocks, downsample - "stage1": [128, 128, 256, 1, True, [2, 1]], - "stage2": [256, 160, 512, 1, True, [1, 2]], - "stage3": [512, 192, 768, 2, True, [2, 1]], - "stage4": [768, 224, 1024, 1, True, [2, 1]], - } - - model = PPHGNet( - stem_channels=[64, 64, 128], - stage_config=stage_config_det if det else stage_config_rec, - layer_num=6, - det=det, - **kwargs - ) - return model diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_lcnetv3.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_lcnetv3.py deleted file mode 100644 index e2bd4572a767560c0a0250aec64fae0c9bdaee2c..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_lcnetv3.py +++ /dev/null @@ -1,516 +0,0 @@ -# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import, division, print_function - -import torch -import torch.nn.functional as F -from torch import nn - -from ..common import Activation - -NET_CONFIG_det = { - "blocks2": - # k, in_c, out_c, s, use_se - [[3, 16, 32, 1, False]], - "blocks3": [[3, 32, 64, 2, False], [3, 64, 64, 1, False]], - "blocks4": [[3, 64, 128, 2, False], [3, 128, 128, 1, False]], - "blocks5": [ - [3, 128, 256, 2, False], - [5, 256, 256, 1, False], - [5, 256, 256, 1, False], - [5, 256, 256, 1, False], - [5, 256, 256, 1, False], - ], - "blocks6": [ - [5, 256, 512, 2, True], - [5, 512, 512, 1, True], - [5, 512, 512, 1, False], - [5, 512, 512, 1, False], - ], -} - -NET_CONFIG_rec = { - "blocks2": - # k, in_c, out_c, s, use_se - [[3, 16, 32, 1, False]], - "blocks3": [[3, 32, 64, 1, False], [3, 64, 64, 1, False]], - "blocks4": [[3, 64, 128, (2, 1), False], [3, 128, 128, 1, False]], - "blocks5": [ - [3, 128, 256, (1, 2), False], - [5, 256, 256, 1, False], - [5, 256, 256, 1, False], - [5, 256, 256, 1, False], - [5, 256, 256, 1, False], - ], - "blocks6": [ - [5, 256, 512, (2, 1), True], - [5, 512, 512, 1, True], - [5, 512, 512, (2, 1), False], - [5, 512, 512, 1, False], - ], -} - - -def make_divisible(v, divisor=16, min_value=None): - if min_value is None: - min_value = divisor - new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - if new_v < 0.9 * v: - new_v += divisor - return new_v - - -class LearnableAffineBlock(nn.Module): - def __init__(self, scale_value=1.0, bias_value=0.0, lr_mult=1.0, lab_lr=0.1): - super().__init__() - self.scale = nn.Parameter(torch.Tensor([scale_value])) - self.bias = nn.Parameter(torch.Tensor([bias_value])) - - def forward(self, x): - return self.scale * x + self.bias - - -class ConvBNLayer(nn.Module): - def __init__( - self, in_channels, out_channels, kernel_size, stride, groups=1, lr_mult=1.0 - ): - super().__init__() - self.conv = nn.Conv2d( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=kernel_size, - stride=stride, - padding=(kernel_size - 1) // 2, - groups=groups, - bias=False, - ) - - self.bn = nn.BatchNorm2d( - out_channels, - ) - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - return x - - -class Act(nn.Module): - def __init__(self, act="hswish", lr_mult=1.0, lab_lr=0.1): - super().__init__() - if act == "hswish": - self.act = nn.Hardswish(inplace=True) - else: - assert act == "relu" - self.act = Activation(act) - self.lab = LearnableAffineBlock(lr_mult=lr_mult, lab_lr=lab_lr) - - def forward(self, x): - return self.lab(self.act(x)) - - -class LearnableRepLayer(nn.Module): - def __init__( - self, - in_channels, - out_channels, - kernel_size, - stride=1, - groups=1, - num_conv_branches=1, - lr_mult=1.0, - lab_lr=0.1, - ): - super().__init__() - self.is_repped = False - self.groups = groups - self.stride = stride - self.kernel_size = kernel_size - self.in_channels = in_channels - self.out_channels = out_channels - self.num_conv_branches = num_conv_branches - self.padding = (kernel_size - 1) // 2 - - self.identity = ( - nn.BatchNorm2d( - num_features=in_channels, - ) - if out_channels == 
in_channels and stride == 1 - else None - ) - - self.conv_kxk = nn.ModuleList( - [ - ConvBNLayer( - in_channels, - out_channels, - kernel_size, - stride, - groups=groups, - lr_mult=lr_mult, - ) - for _ in range(self.num_conv_branches) - ] - ) - - self.conv_1x1 = ( - ConvBNLayer( - in_channels, out_channels, 1, stride, groups=groups, lr_mult=lr_mult - ) - if kernel_size > 1 - else None - ) - - self.lab = LearnableAffineBlock(lr_mult=lr_mult, lab_lr=lab_lr) - self.act = Act(lr_mult=lr_mult, lab_lr=lab_lr) - - def forward(self, x): - # for export - if self.is_repped: - out = self.lab(self.reparam_conv(x)) - if self.stride != 2: - out = self.act(out) - return out - - out = 0 - if self.identity is not None: - out += self.identity(x) - - if self.conv_1x1 is not None: - out += self.conv_1x1(x) - - for conv in self.conv_kxk: - out += conv(x) - - out = self.lab(out) - if self.stride != 2: - out = self.act(out) - return out - - def rep(self): - if self.is_repped: - return - kernel, bias = self._get_kernel_bias() - self.reparam_conv = nn.Conv2d( - in_channels=self.in_channels, - out_channels=self.out_channels, - kernel_size=self.kernel_size, - stride=self.stride, - padding=self.padding, - groups=self.groups, - ) - self.reparam_conv.weight.data = kernel - self.reparam_conv.bias.data = bias - self.is_repped = True - - def _pad_kernel_1x1_to_kxk(self, kernel1x1, pad): - if not isinstance(kernel1x1, torch.Tensor): - return 0 - else: - return nn.functional.pad(kernel1x1, [pad, pad, pad, pad]) - - def _get_kernel_bias(self): - kernel_conv_1x1, bias_conv_1x1 = self._fuse_bn_tensor(self.conv_1x1) - kernel_conv_1x1 = self._pad_kernel_1x1_to_kxk( - kernel_conv_1x1, self.kernel_size // 2 - ) - - kernel_identity, bias_identity = self._fuse_bn_tensor(self.identity) - - kernel_conv_kxk = 0 - bias_conv_kxk = 0 - for conv in self.conv_kxk: - kernel, bias = self._fuse_bn_tensor(conv) - kernel_conv_kxk += kernel - bias_conv_kxk += bias - - kernel_reparam = kernel_conv_kxk + kernel_conv_1x1 + kernel_identity - bias_reparam = bias_conv_kxk + bias_conv_1x1 + bias_identity - return kernel_reparam, bias_reparam - - def _fuse_bn_tensor(self, branch): - if not branch: - return 0, 0 - elif isinstance(branch, ConvBNLayer): - kernel = branch.conv.weight - running_mean = branch.bn._mean - running_var = branch.bn._variance - gamma = branch.bn.weight - beta = branch.bn.bias - eps = branch.bn._epsilon - else: - assert isinstance(branch, nn.BatchNorm2d) - if not hasattr(self, "id_tensor"): - input_dim = self.in_channels // self.groups - kernel_value = torch.zeros( - (self.in_channels, input_dim, self.kernel_size, self.kernel_size), - dtype=branch.weight.dtype, - ) - for i in range(self.in_channels): - kernel_value[ - i, i % input_dim, self.kernel_size // 2, self.kernel_size // 2 - ] = 1 - self.id_tensor = kernel_value - kernel = self.id_tensor - running_mean = branch._mean - running_var = branch._variance - gamma = branch.weight - beta = branch.bias - eps = branch._epsilon - std = (running_var + eps).sqrt() - t = (gamma / std).reshape((-1, 1, 1, 1)) - return kernel * t, beta - running_mean * gamma / std - - -class SELayer(nn.Module): - def __init__(self, channel, reduction=4, lr_mult=1.0): - super().__init__() - self.avg_pool = nn.AdaptiveAvgPool2d(1) - self.conv1 = nn.Conv2d( - in_channels=channel, - out_channels=channel // reduction, - kernel_size=1, - stride=1, - padding=0, - ) - self.relu = nn.ReLU() - self.conv2 = nn.Conv2d( - in_channels=channel // reduction, - out_channels=channel, - kernel_size=1, - stride=1, - padding=0, - 
) - self.hardsigmoid = nn.Hardsigmoid(inplace=True) - - def forward(self, x): - identity = x - x = self.avg_pool(x) - x = self.conv1(x) - x = self.relu(x) - x = self.conv2(x) - x = self.hardsigmoid(x) - x = identity * x - return x - - -class LCNetV3Block(nn.Module): - def __init__( - self, - in_channels, - out_channels, - stride, - dw_size, - use_se=False, - conv_kxk_num=4, - lr_mult=1.0, - lab_lr=0.1, - ): - super().__init__() - self.use_se = use_se - self.dw_conv = LearnableRepLayer( - in_channels=in_channels, - out_channels=in_channels, - kernel_size=dw_size, - stride=stride, - groups=in_channels, - num_conv_branches=conv_kxk_num, - lr_mult=lr_mult, - lab_lr=lab_lr, - ) - if use_se: - self.se = SELayer(in_channels, lr_mult=lr_mult) - self.pw_conv = LearnableRepLayer( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=1, - stride=1, - num_conv_branches=conv_kxk_num, - lr_mult=lr_mult, - lab_lr=lab_lr, - ) - - def forward(self, x): - x = self.dw_conv(x) - if self.use_se: - x = self.se(x) - x = self.pw_conv(x) - return x - - -class PPLCNetV3(nn.Module): - def __init__( - self, - scale=1.0, - conv_kxk_num=4, - lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0], - lab_lr=0.1, - det=False, - **kwargs - ): - super().__init__() - self.scale = scale - self.lr_mult_list = lr_mult_list - self.det = det - - self.net_config = NET_CONFIG_det if self.det else NET_CONFIG_rec - - assert isinstance( - self.lr_mult_list, (list, tuple) - ), "lr_mult_list should be in (list, tuple) but got {}".format( - type(self.lr_mult_list) - ) - assert ( - len(self.lr_mult_list) == 6 - ), "lr_mult_list length should be 6 but got {}".format(len(self.lr_mult_list)) - - self.conv1 = ConvBNLayer( - in_channels=3, - out_channels=make_divisible(16 * scale), - kernel_size=3, - stride=2, - lr_mult=self.lr_mult_list[0], - ) - - self.blocks2 = nn.Sequential( - *[ - LCNetV3Block( - in_channels=make_divisible(in_c * scale), - out_channels=make_divisible(out_c * scale), - dw_size=k, - stride=s, - use_se=se, - conv_kxk_num=conv_kxk_num, - lr_mult=self.lr_mult_list[1], - lab_lr=lab_lr, - ) - for i, (k, in_c, out_c, s, se) in enumerate(self.net_config["blocks2"]) - ] - ) - - self.blocks3 = nn.Sequential( - *[ - LCNetV3Block( - in_channels=make_divisible(in_c * scale), - out_channels=make_divisible(out_c * scale), - dw_size=k, - stride=s, - use_se=se, - conv_kxk_num=conv_kxk_num, - lr_mult=self.lr_mult_list[2], - lab_lr=lab_lr, - ) - for i, (k, in_c, out_c, s, se) in enumerate(self.net_config["blocks3"]) - ] - ) - - self.blocks4 = nn.Sequential( - *[ - LCNetV3Block( - in_channels=make_divisible(in_c * scale), - out_channels=make_divisible(out_c * scale), - dw_size=k, - stride=s, - use_se=se, - conv_kxk_num=conv_kxk_num, - lr_mult=self.lr_mult_list[3], - lab_lr=lab_lr, - ) - for i, (k, in_c, out_c, s, se) in enumerate(self.net_config["blocks4"]) - ] - ) - - self.blocks5 = nn.Sequential( - *[ - LCNetV3Block( - in_channels=make_divisible(in_c * scale), - out_channels=make_divisible(out_c * scale), - dw_size=k, - stride=s, - use_se=se, - conv_kxk_num=conv_kxk_num, - lr_mult=self.lr_mult_list[4], - lab_lr=lab_lr, - ) - for i, (k, in_c, out_c, s, se) in enumerate(self.net_config["blocks5"]) - ] - ) - - self.blocks6 = nn.Sequential( - *[ - LCNetV3Block( - in_channels=make_divisible(in_c * scale), - out_channels=make_divisible(out_c * scale), - dw_size=k, - stride=s, - use_se=se, - conv_kxk_num=conv_kxk_num, - lr_mult=self.lr_mult_list[5], - lab_lr=lab_lr, - ) - for i, (k, in_c, out_c, s, se) in enumerate(self.net_config["blocks6"]) 
- ] - ) - self.out_channels = make_divisible(512 * scale) - - if self.det: - mv_c = [16, 24, 56, 480] - self.out_channels = [ - make_divisible(self.net_config["blocks3"][-1][2] * scale), - make_divisible(self.net_config["blocks4"][-1][2] * scale), - make_divisible(self.net_config["blocks5"][-1][2] * scale), - make_divisible(self.net_config["blocks6"][-1][2] * scale), - ] - - self.layer_list = nn.ModuleList( - [ - nn.Conv2d(self.out_channels[0], int(mv_c[0] * scale), 1, 1, 0), - nn.Conv2d(self.out_channels[1], int(mv_c[1] * scale), 1, 1, 0), - nn.Conv2d(self.out_channels[2], int(mv_c[2] * scale), 1, 1, 0), - nn.Conv2d(self.out_channels[3], int(mv_c[3] * scale), 1, 1, 0), - ] - ) - self.out_channels = [ - int(mv_c[0] * scale), - int(mv_c[1] * scale), - int(mv_c[2] * scale), - int(mv_c[3] * scale), - ] - - def forward(self, x): - out_list = [] - x = self.conv1(x) - x = self.blocks2(x) - x = self.blocks3(x) - out_list.append(x) - x = self.blocks4(x) - out_list.append(x) - x = self.blocks5(x) - out_list.append(x) - x = self.blocks6(x) - out_list.append(x) - - if self.det: - out_list[0] = self.layer_list[0](out_list[0]) - out_list[1] = self.layer_list[1](out_list[1]) - out_list[2] = self.layer_list[2](out_list[2]) - out_list[3] = self.layer_list[3](out_list[3]) - return out_list - - if self.training: - x = F.adaptive_avg_pool2d(x, [1, 40]) - else: - x = F.avg_pool2d(x, [3, 2]) - return x diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mobilenet_v3.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mobilenet_v3.py deleted file mode 100644 index d284a6d49a2b4abfab285643aa849b9e6bf2db37..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mobilenet_v3.py +++ /dev/null @@ -1,136 +0,0 @@ -from torch import nn - -from .det_mobilenet_v3 import ConvBNLayer, ResidualUnit, make_divisible - - -class MobileNetV3(nn.Module): - def __init__( - self, - in_channels=3, - model_name="small", - scale=0.5, - large_stride=None, - small_stride=None, - **kwargs - ): - super(MobileNetV3, self).__init__() - if small_stride is None: - small_stride = [2, 2, 2, 2] - if large_stride is None: - large_stride = [1, 2, 2, 2] - - assert isinstance( - large_stride, list - ), "large_stride type must " "be list but got {}".format(type(large_stride)) - assert isinstance( - small_stride, list - ), "small_stride type must " "be list but got {}".format(type(small_stride)) - assert ( - len(large_stride) == 4 - ), "large_stride length must be " "4 but got {}".format(len(large_stride)) - assert ( - len(small_stride) == 4 - ), "small_stride length must be " "4 but got {}".format(len(small_stride)) - - if model_name == "large": - cfg = [ - # k, exp, c, se, nl, s, - [3, 16, 16, False, "relu", large_stride[0]], - [3, 64, 24, False, "relu", (large_stride[1], 1)], - [3, 72, 24, False, "relu", 1], - [5, 72, 40, True, "relu", (large_stride[2], 1)], - [5, 120, 40, True, "relu", 1], - [5, 120, 40, True, "relu", 1], - [3, 240, 80, False, "hard_swish", 1], - [3, 200, 80, False, "hard_swish", 1], - [3, 184, 80, False, "hard_swish", 1], - [3, 184, 80, False, "hard_swish", 1], - [3, 480, 112, True, "hard_swish", 1], - [3, 672, 112, True, "hard_swish", 1], - [5, 672, 160, True, "hard_swish", (large_stride[3], 1)], - [5, 960, 160, True, "hard_swish", 1], - [5, 960, 160, True, "hard_swish", 1], - ] - cls_ch_squeeze = 960 - elif model_name == "small": - cfg = [ - # k, exp, c, se, nl, s, - [3, 
16, 16, True, "relu", (small_stride[0], 1)], - [3, 72, 24, False, "relu", (small_stride[1], 1)], - [3, 88, 24, False, "relu", 1], - [5, 96, 40, True, "hard_swish", (small_stride[2], 1)], - [5, 240, 40, True, "hard_swish", 1], - [5, 240, 40, True, "hard_swish", 1], - [5, 120, 48, True, "hard_swish", 1], - [5, 144, 48, True, "hard_swish", 1], - [5, 288, 96, True, "hard_swish", (small_stride[3], 1)], - [5, 576, 96, True, "hard_swish", 1], - [5, 576, 96, True, "hard_swish", 1], - ] - cls_ch_squeeze = 576 - else: - raise NotImplementedError( - "mode[" + model_name + "_model] is not implemented!" - ) - - supported_scale = [0.35, 0.5, 0.75, 1.0, 1.25] - assert ( - scale in supported_scale - ), "supported scales are {} but input scale is {}".format( - supported_scale, scale - ) - - inplanes = 16 - # conv1 - self.conv1 = ConvBNLayer( - in_channels=in_channels, - out_channels=make_divisible(inplanes * scale), - kernel_size=3, - stride=2, - padding=1, - groups=1, - if_act=True, - act="hard_swish", - name="conv1", - ) - i = 0 - block_list = [] - inplanes = make_divisible(inplanes * scale) - for k, exp, c, se, nl, s in cfg: - block_list.append( - ResidualUnit( - in_channels=inplanes, - mid_channels=make_divisible(scale * exp), - out_channels=make_divisible(scale * c), - kernel_size=k, - stride=s, - use_se=se, - act=nl, - name="conv" + str(i + 2), - ) - ) - inplanes = make_divisible(scale * c) - i += 1 - self.blocks = nn.Sequential(*block_list) - - self.conv2 = ConvBNLayer( - in_channels=inplanes, - out_channels=make_divisible(scale * cls_ch_squeeze), - kernel_size=1, - stride=1, - padding=0, - groups=1, - if_act=True, - act="hard_swish", - name="conv_last", - ) - - self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) - self.out_channels = make_divisible(scale * cls_ch_squeeze) - - def forward(self, x): - x = self.conv1(x) - x = self.blocks(x) - x = self.conv2(x) - x = self.pool(x) - return x diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mv1_enhance.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mv1_enhance.py deleted file mode 100644 index 447c48f6554c69fec68b77de25e0386cba4aaca8..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mv1_enhance.py +++ /dev/null @@ -1,234 +0,0 @@ -import os, sys -import torch -import torch.nn as nn -import torch.nn.functional as F - -from ..common import Activation - - -class ConvBNLayer(nn.Module): - def __init__(self, - num_channels, - filter_size, - num_filters, - stride, - padding, - channels=None, - num_groups=1, - act='hard_swish'): - super(ConvBNLayer, self).__init__() - self.act = act - self._conv = nn.Conv2d( - in_channels=num_channels, - out_channels=num_filters, - kernel_size=filter_size, - stride=stride, - padding=padding, - groups=num_groups, - bias=False) - - self._batch_norm = nn.BatchNorm2d( - num_filters, - ) - if self.act is not None: - self._act = Activation(act_type=act, inplace=True) - - def forward(self, inputs): - y = self._conv(inputs) - y = self._batch_norm(y) - if self.act is not None: - y = self._act(y) - return y - - -class DepthwiseSeparable(nn.Module): - def __init__(self, - num_channels, - num_filters1, - num_filters2, - num_groups, - stride, - scale, - dw_size=3, - padding=1, - use_se=False): - super(DepthwiseSeparable, self).__init__() - self.use_se = use_se - self._depthwise_conv = ConvBNLayer( - num_channels=num_channels, - num_filters=int(num_filters1 * 
scale), - filter_size=dw_size, - stride=stride, - padding=padding, - num_groups=int(num_groups * scale)) - if use_se: - self._se = SEModule(int(num_filters1 * scale)) - self._pointwise_conv = ConvBNLayer( - num_channels=int(num_filters1 * scale), - filter_size=1, - num_filters=int(num_filters2 * scale), - stride=1, - padding=0) - - def forward(self, inputs): - y = self._depthwise_conv(inputs) - if self.use_se: - y = self._se(y) - y = self._pointwise_conv(y) - return y - - -class MobileNetV1Enhance(nn.Module): - def __init__(self, - in_channels=3, - scale=0.5, - last_conv_stride=1, - last_pool_type='max', - **kwargs): - super().__init__() - self.scale = scale - self.block_list = [] - - self.conv1 = ConvBNLayer( - num_channels=in_channels, - filter_size=3, - channels=3, - num_filters=int(32 * scale), - stride=2, - padding=1) - - conv2_1 = DepthwiseSeparable( - num_channels=int(32 * scale), - num_filters1=32, - num_filters2=64, - num_groups=32, - stride=1, - scale=scale) - self.block_list.append(conv2_1) - - conv2_2 = DepthwiseSeparable( - num_channels=int(64 * scale), - num_filters1=64, - num_filters2=128, - num_groups=64, - stride=1, - scale=scale) - self.block_list.append(conv2_2) - - conv3_1 = DepthwiseSeparable( - num_channels=int(128 * scale), - num_filters1=128, - num_filters2=128, - num_groups=128, - stride=1, - scale=scale) - self.block_list.append(conv3_1) - - conv3_2 = DepthwiseSeparable( - num_channels=int(128 * scale), - num_filters1=128, - num_filters2=256, - num_groups=128, - stride=(2, 1), - scale=scale) - self.block_list.append(conv3_2) - - conv4_1 = DepthwiseSeparable( - num_channels=int(256 * scale), - num_filters1=256, - num_filters2=256, - num_groups=256, - stride=1, - scale=scale) - self.block_list.append(conv4_1) - - conv4_2 = DepthwiseSeparable( - num_channels=int(256 * scale), - num_filters1=256, - num_filters2=512, - num_groups=256, - stride=(2, 1), - scale=scale) - self.block_list.append(conv4_2) - - for _ in range(5): - conv5 = DepthwiseSeparable( - num_channels=int(512 * scale), - num_filters1=512, - num_filters2=512, - num_groups=512, - stride=1, - dw_size=5, - padding=2, - scale=scale, - use_se=False) - self.block_list.append(conv5) - - conv5_6 = DepthwiseSeparable( - num_channels=int(512 * scale), - num_filters1=512, - num_filters2=1024, - num_groups=512, - stride=(2, 1), - dw_size=5, - padding=2, - scale=scale, - use_se=True) - self.block_list.append(conv5_6) - - conv6 = DepthwiseSeparable( - num_channels=int(1024 * scale), - num_filters1=1024, - num_filters2=1024, - num_groups=1024, - stride=last_conv_stride, - dw_size=5, - padding=2, - use_se=True, - scale=scale) - self.block_list.append(conv6) - - self.block_list = nn.Sequential(*self.block_list) - if last_pool_type == 'avg': - self.pool = nn.AvgPool2d(kernel_size=2, stride=2, padding=0) - else: - self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) - self.out_channels = int(1024 * scale) - - def forward(self, inputs): - y = self.conv1(inputs) - y = self.block_list(y) - y = self.pool(y) - return y - -def hardsigmoid(x): - return F.relu6(x + 3., inplace=True) / 6. 
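# Usage sketch (illustrative only; the input shape and scale below are assumptions,
# not taken from the original source). MobileNetV1Enhance is a text-recognition
# backbone: its (2, 1) strides halve the feature-map height while keeping the width,
# so a text-line crop comes out as a wide, one-row feature map for the CTC head.
#
#   backbone = MobileNetV1Enhance(in_channels=3, scale=0.5)
#   feats = backbone(torch.randn(1, 3, 32, 320))
#   # feats.shape is roughly (1, 512, 1, 80), and backbone.out_channels == 512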
- -class SEModule(nn.Module): - def __init__(self, channel, reduction=4): - super(SEModule, self).__init__() - self.avg_pool = nn.AdaptiveAvgPool2d(1) - self.conv1 = nn.Conv2d( - in_channels=channel, - out_channels=channel // reduction, - kernel_size=1, - stride=1, - padding=0, - bias=True) - self.conv2 = nn.Conv2d( - in_channels=channel // reduction, - out_channels=channel, - kernel_size=1, - stride=1, - padding=0, - bias=True) - - def forward(self, inputs): - outputs = self.avg_pool(inputs) - outputs = self.conv1(outputs) - outputs = F.relu(outputs) - outputs = self.conv2(outputs) - outputs = hardsigmoid(outputs) - x = torch.mul(inputs, outputs) - - return x diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_pphgnetv2.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_pphgnetv2.py deleted file mode 100644 index 390ca4c61b4b7fd1635d5229d5ef3d79fc3509fe..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_pphgnetv2.py +++ /dev/null @@ -1,810 +0,0 @@ -import math -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class AdaptiveAvgPool2D(nn.AdaptiveAvgPool2d): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - if isinstance(self.output_size, int) and self.output_size == 1: - self._gap = True - elif ( - isinstance(self.output_size, tuple) - and self.output_size[0] == 1 - and self.output_size[1] == 1 - ): - self._gap = True - else: - self._gap = False - - def forward(self, x): - if self._gap: - # Global Average Pooling - N, C, _, _ = x.shape - x_mean = torch.mean(x, dim=[2, 3]) - x_mean = torch.reshape(x_mean, [N, C, 1, 1]) - return x_mean - else: - return F.adaptive_avg_pool2d( - x, - output_size=self.output_size - ) - -class LearnableAffineBlock(nn.Module): - """ - Create a learnable affine block module. This module can significantly improve accuracy on smaller models. - - Args: - scale_value (float): The initial value of the scale parameter, default is 1.0. - bias_value (float): The initial value of the bias parameter, default is 0.0. - lr_mult (float): The learning rate multiplier, default is 1.0. - lab_lr (float): The learning rate, default is 0.01. - """ - - def __init__(self, scale_value=1.0, bias_value=0.0, lr_mult=1.0, lab_lr=0.01): - super().__init__() - self.scale = nn.Parameter(torch.Tensor([scale_value])) - self.bias = nn.Parameter(torch.Tensor([bias_value])) - - def forward(self, x): - return self.scale * x + self.bias - - -class ConvBNAct(nn.Module): - """ - ConvBNAct is a combination of convolution and batchnorm layers. - - Args: - in_channels (int): Number of input channels. - out_channels (int): Number of output channels. - kernel_size (int): Size of the convolution kernel. Defaults to 3. - stride (int): Stride of the convolution. Defaults to 1. - padding (int/str): Padding or padding type for the convolution. Defaults to 1. - groups (int): Number of groups for the convolution. Defaults to 1. - use_act: (bool): Whether to use activation function. Defaults to True. - use_lab (bool): Whether to use the LAB operation. Defaults to False. - lr_mult (float): Learning rate multiplier for the layer. Defaults to 1.0. 
- """ - - def __init__( - self, - in_channels, - out_channels, - kernel_size=3, - stride=1, - padding=1, - groups=1, - use_act=True, - use_lab=False, - lr_mult=1.0, - ): - super().__init__() - self.use_act = use_act - self.use_lab = use_lab - - self.conv = nn.Conv2d( - in_channels, - out_channels, - kernel_size, - stride, - padding=padding if isinstance(padding, str) else (kernel_size - 1) // 2, - # padding=(kernel_size - 1) // 2, - groups=groups, - bias=False, - ) - self.bn = nn.BatchNorm2d( - out_channels, - ) - if self.use_act: - self.act = nn.ReLU() - if self.use_lab: - self.lab = LearnableAffineBlock(lr_mult=lr_mult) - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - if self.use_act: - x = self.act(x) - if self.use_lab: - x = self.lab(x) - return x - - -class LightConvBNAct(nn.Module): - """ - LightConvBNAct is a combination of pw and dw layers. - - Args: - in_channels (int): Number of input channels. - out_channels (int): Number of output channels. - kernel_size (int): Size of the depth-wise convolution kernel. - use_lab (bool): Whether to use the LAB operation. Defaults to False. - lr_mult (float): Learning rate multiplier for the layer. Defaults to 1.0. - """ - - def __init__( - self, - in_channels, - out_channels, - kernel_size, - use_lab=False, - lr_mult=1.0, - **kwargs, - ): - super().__init__() - self.conv1 = ConvBNAct( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=1, - use_act=False, - use_lab=use_lab, - lr_mult=lr_mult, - ) - self.conv2 = ConvBNAct( - in_channels=out_channels, - out_channels=out_channels, - kernel_size=kernel_size, - groups=out_channels, - use_act=True, - use_lab=use_lab, - lr_mult=lr_mult, - ) - - def forward(self, x): - x = self.conv1(x) - x = self.conv2(x) - return x - - -class CustomMaxPool2d(nn.Module): - def __init__( - self, - kernel_size, - stride=None, - padding=0, - dilation=1, - return_indices=False, - ceil_mode=False, - data_format="NCHW", - ): - super(CustomMaxPool2d, self).__init__() - self.kernel_size = kernel_size if isinstance(kernel_size, (tuple, list)) else (kernel_size, kernel_size) - self.stride = stride if stride is not None else self.kernel_size - self.stride = self.stride if isinstance(self.stride, (tuple, list)) else (self.stride, self.stride) - self.dilation = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) - self.return_indices = return_indices - self.ceil_mode = ceil_mode - self.padding_mode = padding - - # 当padding不是"same"时使用标准MaxPool2d - if padding != "same": - self.padding = padding if isinstance(padding, (tuple, list)) else (padding, padding) - self.pool = nn.MaxPool2d( - kernel_size=self.kernel_size, - stride=self.stride, - padding=self.padding, - dilation=self.dilation, - return_indices=self.return_indices, - ceil_mode=self.ceil_mode - ) - - def forward(self, x): - # 处理same padding - if self.padding_mode == "same": - input_height, input_width = x.size(2), x.size(3) - - # 计算期望的输出尺寸 - out_height = math.ceil(input_height / self.stride[0]) - out_width = math.ceil(input_width / self.stride[1]) - - # 计算需要的padding - pad_height = max((out_height - 1) * self.stride[0] + self.kernel_size[0] - input_height, 0) - pad_width = max((out_width - 1) * self.stride[1] + self.kernel_size[1] - input_width, 0) - - # 将padding分配到两边 - pad_top = pad_height // 2 - pad_bottom = pad_height - pad_top - pad_left = pad_width // 2 - pad_right = pad_width - pad_left - - # 应用padding - x = F.pad(x, (pad_left, pad_right, pad_top, pad_bottom)) - - # 使用标准max_pool2d函数 - if self.return_indices: - return 
F.max_pool2d_with_indices( - x, - kernel_size=self.kernel_size, - stride=self.stride, - padding=0, # 已经手动pad过了 - dilation=self.dilation, - ceil_mode=self.ceil_mode - ) - else: - return F.max_pool2d( - x, - kernel_size=self.kernel_size, - stride=self.stride, - padding=0, # 已经手动pad过了 - dilation=self.dilation, - ceil_mode=self.ceil_mode - ) - else: - # 使用预定义的MaxPool2d - return self.pool(x) - -class StemBlock(nn.Module): - """ - StemBlock for PP-HGNetV2. - - Args: - in_channels (int): Number of input channels. - mid_channels (int): Number of middle channels. - out_channels (int): Number of output channels. - use_lab (bool): Whether to use the LAB operation. Defaults to False. - lr_mult (float): Learning rate multiplier for the layer. Defaults to 1.0. - """ - - def __init__( - self, - in_channels, - mid_channels, - out_channels, - use_lab=False, - lr_mult=1.0, - text_rec=False, - ): - super().__init__() - self.stem1 = ConvBNAct( - in_channels=in_channels, - out_channels=mid_channels, - kernel_size=3, - stride=2, - use_lab=use_lab, - lr_mult=lr_mult, - ) - self.stem2a = ConvBNAct( - in_channels=mid_channels, - out_channels=mid_channels // 2, - kernel_size=2, - stride=1, - padding="same", - use_lab=use_lab, - lr_mult=lr_mult, - ) - self.stem2b = ConvBNAct( - in_channels=mid_channels // 2, - out_channels=mid_channels, - kernel_size=2, - stride=1, - padding="same", - use_lab=use_lab, - lr_mult=lr_mult, - ) - self.stem3 = ConvBNAct( - in_channels=mid_channels * 2, - out_channels=mid_channels, - kernel_size=3, - stride=1 if text_rec else 2, - use_lab=use_lab, - lr_mult=lr_mult, - ) - self.stem4 = ConvBNAct( - in_channels=mid_channels, - out_channels=out_channels, - kernel_size=1, - stride=1, - use_lab=use_lab, - lr_mult=lr_mult, - ) - self.pool = CustomMaxPool2d( - kernel_size=2, stride=1, ceil_mode=True, padding="same" - ) - # self.pool = nn.MaxPool2d( - # kernel_size=2, stride=1, ceil_mode=True, padding=1 - # ) - - def forward(self, x): - x = self.stem1(x) - x2 = self.stem2a(x) - x2 = self.stem2b(x2) - x1 = self.pool(x) - - # if x1.shape[2:] != x2.shape[2:]: - # x1 = F.interpolate(x1, size=x2.shape[2:], mode='bilinear', align_corners=False) - - x = torch.cat([x1, x2], 1) - x = self.stem3(x) - x = self.stem4(x) - - return x - - -class HGV2_Block(nn.Module): - """ - HGV2_Block, the basic unit that constitutes the HGV2_Stage. - - Args: - in_channels (int): Number of input channels. - mid_channels (int): Number of middle channels. - out_channels (int): Number of output channels. - kernel_size (int): Size of the convolution kernel. Defaults to 3. - layer_num (int): Number of layers in the HGV2 block. Defaults to 6. - stride (int): Stride of the convolution. Defaults to 1. - padding (int/str): Padding or padding type for the convolution. Defaults to 1. - groups (int): Number of groups for the convolution. Defaults to 1. - use_act (bool): Whether to use activation function. Defaults to True. - use_lab (bool): Whether to use the LAB operation. Defaults to False. - lr_mult (float): Learning rate multiplier for the layer. Defaults to 1.0. 
- """ - - def __init__( - self, - in_channels, - mid_channels, - out_channels, - kernel_size=3, - layer_num=6, - identity=False, - light_block=True, - use_lab=False, - lr_mult=1.0, - ): - super().__init__() - self.identity = identity - - self.layers = nn.ModuleList() - block_type = "LightConvBNAct" if light_block else "ConvBNAct" - for i in range(layer_num): - self.layers.append( - eval(block_type)( - in_channels=in_channels if i == 0 else mid_channels, - out_channels=mid_channels, - stride=1, - kernel_size=kernel_size, - use_lab=use_lab, - lr_mult=lr_mult, - ) - ) - # feature aggregation - total_channels = in_channels + layer_num * mid_channels - self.aggregation_squeeze_conv = ConvBNAct( - in_channels=total_channels, - out_channels=out_channels // 2, - kernel_size=1, - stride=1, - use_lab=use_lab, - lr_mult=lr_mult, - ) - self.aggregation_excitation_conv = ConvBNAct( - in_channels=out_channels // 2, - out_channels=out_channels, - kernel_size=1, - stride=1, - use_lab=use_lab, - lr_mult=lr_mult, - ) - - def forward(self, x): - identity = x - output = [] - output.append(x) - for layer in self.layers: - x = layer(x) - output.append(x) - x = torch.cat(output, dim=1) - x = self.aggregation_squeeze_conv(x) - x = self.aggregation_excitation_conv(x) - if self.identity: - x += identity - return x - - -class HGV2_Stage(nn.Module): - """ - HGV2_Stage, the basic unit that constitutes the PPHGNetV2. - - Args: - in_channels (int): Number of input channels. - mid_channels (int): Number of middle channels. - out_channels (int): Number of output channels. - block_num (int): Number of blocks in the HGV2 stage. - layer_num (int): Number of layers in the HGV2 block. Defaults to 6. - is_downsample (bool): Whether to use downsampling operation. Defaults to False. - light_block (bool): Whether to use light block. Defaults to True. - kernel_size (int): Size of the convolution kernel. Defaults to 3. - use_lab (bool, optional): Whether to use the LAB operation. Defaults to False. - lr_mult (float, optional): Learning rate multiplier for the layer. Defaults to 1.0. 
- """ - - def __init__( - self, - in_channels, - mid_channels, - out_channels, - block_num, - layer_num=6, - is_downsample=True, - light_block=True, - kernel_size=3, - use_lab=False, - stride=2, - lr_mult=1.0, - ): - - super().__init__() - self.is_downsample = is_downsample - if self.is_downsample: - self.downsample = ConvBNAct( - in_channels=in_channels, - out_channels=in_channels, - kernel_size=3, - stride=stride, - groups=in_channels, - use_act=False, - use_lab=use_lab, - lr_mult=lr_mult, - ) - - blocks_list = [] - for i in range(block_num): - blocks_list.append( - HGV2_Block( - in_channels=in_channels if i == 0 else out_channels, - mid_channels=mid_channels, - out_channels=out_channels, - kernel_size=kernel_size, - layer_num=layer_num, - identity=False if i == 0 else True, - light_block=light_block, - use_lab=use_lab, - lr_mult=lr_mult, - ) - ) - self.blocks = nn.Sequential(*blocks_list) - - def forward(self, x): - if self.is_downsample: - x = self.downsample(x) - x = self.blocks(x) - return x - - -class DropoutInferDownscale(nn.Module): - """ - 实现与Paddle的mode="downscale_in_infer"等效的Dropout - 训练模式:out = input * mask(直接应用掩码,不进行放大) - 推理模式:out = input * (1.0 - p)(在推理时按概率缩小) - """ - - def __init__(self, p=0.5): - super().__init__() - self.p = p - - def forward(self, x): - if self.training: - # 训练时:应用随机mask但不放大 - return F.dropout(x, self.p, training=True) * (1.0 - self.p) - else: - # 推理时:按照dropout概率缩小输出 - return x * (1.0 - self.p) - -class PPHGNetV2(nn.Module): - """ - PPHGNetV2 - - Args: - stage_config (dict): Config for PPHGNetV2 stages. such as the number of channels, stride, etc. - stem_channels: (list): Number of channels of the stem of the PPHGNetV2. - use_lab (bool): Whether to use the LAB operation. Defaults to False. - use_last_conv (bool): Whether to use the last conv layer as the output channel. Defaults to True. - class_expand (int): Number of channels for the last 1x1 convolutional layer. - drop_prob (float): Dropout probability for the last 1x1 convolutional layer. Defaults to 0.0. - class_num (int): The number of classes for the classification layer. Defaults to 1000. - lr_mult_list (list): Learning rate multiplier for the stages. Defaults to [1.0, 1.0, 1.0, 1.0, 1.0]. - Returns: - model: nn.Layer. Specific PPHGNetV2 model depends on args. 
- """ - - def __init__( - self, - stage_config, - stem_channels=[3, 32, 64], - use_lab=False, - use_last_conv=True, - class_expand=2048, - dropout_prob=0.0, - class_num=1000, - lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], - det=False, - text_rec=False, - out_indices=None, - **kwargs, - ): - super().__init__() - self.det = det - self.text_rec = text_rec - self.use_lab = use_lab - self.use_last_conv = use_last_conv - self.class_expand = class_expand - self.class_num = class_num - self.out_indices = out_indices if out_indices is not None else [0, 1, 2, 3] - self.out_channels = [] - - # stem - self.stem = StemBlock( - in_channels=stem_channels[0], - mid_channels=stem_channels[1], - out_channels=stem_channels[2], - use_lab=use_lab, - lr_mult=lr_mult_list[0], - text_rec=text_rec, - ) - - # stages - self.stages = nn.ModuleList() - for i, k in enumerate(stage_config): - ( - in_channels, - mid_channels, - out_channels, - block_num, - is_downsample, - light_block, - kernel_size, - layer_num, - stride, - ) = stage_config[k] - self.stages.append( - HGV2_Stage( - in_channels, - mid_channels, - out_channels, - block_num, - layer_num, - is_downsample, - light_block, - kernel_size, - use_lab, - stride, - lr_mult=lr_mult_list[i + 1], - ) - ) - if i in self.out_indices: - self.out_channels.append(out_channels) - if not self.det: - self.out_channels = stage_config["stage4"][2] - - self.avg_pool = AdaptiveAvgPool2D(1) - - if self.use_last_conv: - self.last_conv = nn.Conv2d( - in_channels=out_channels, - out_channels=self.class_expand, - kernel_size=1, - stride=1, - padding=0, - bias=False, - ) - self.act = nn.ReLU() - if self.use_lab: - self.lab = LearnableAffineBlock() - self.dropout = DropoutInferDownscale(p=dropout_prob) - - self.flatten = nn.Flatten(start_dim=1, end_dim=-1) - if not self.det: - self.fc = nn.Linear( - self.class_expand if self.use_last_conv else out_channels, - self.class_num, - ) - - self._init_weights() - - def _init_weights(self): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight) - elif isinstance(m, nn.BatchNorm2d): - nn.init.ones_(m.weight) - nn.init.zeros_(m.bias) - elif isinstance(m, nn.Linear): - nn.init.zeros_(m.bias) - - def forward(self, x): - x = self.stem(x) - out = [] - for i, stage in enumerate(self.stages): - x = stage(x) - if self.det and i in self.out_indices: - out.append(x) - if self.det: - return out - - if self.text_rec: - if self.training: - x = F.adaptive_avg_pool2d(x, [1, 40]) - else: - x = F.avg_pool2d(x, [3, 2]) - return x - - -def PPHGNetV2_B0(pretrained=False, use_ssld=False, **kwargs): - """ - PPHGNetV2_B0 - Args: - pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise. - If str, means the path of the pretrained model. - use_ssld (bool) Whether using ssld pretrained model when pretrained is True. - Returns: - model: nn.Layer. Specific `PPHGNetV2_B0` model depends on args. - """ - stage_config = { - # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num - "stage1": [16, 16, 64, 1, False, False, 3, 3], - "stage2": [64, 32, 256, 1, True, False, 3, 3], - "stage3": [256, 64, 512, 2, True, True, 5, 3], - "stage4": [512, 128, 1024, 1, True, True, 5, 3], - } - - model = PPHGNetV2( - stem_channels=[3, 16, 16], stage_config=stage_config, use_lab=True, **kwargs - ) - return model - - -def PPHGNetV2_B1(pretrained=False, use_ssld=False, **kwargs): - """ - PPHGNetV2_B1 - Args: - pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise. 
- If str, means the path of the pretrained model. - use_ssld (bool) Whether using ssld pretrained model when pretrained is True. - Returns: - model: nn.Layer. Specific `PPHGNetV2_B1` model depends on args. - """ - stage_config = { - # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num - "stage1": [32, 32, 64, 1, False, False, 3, 3], - "stage2": [64, 48, 256, 1, True, False, 3, 3], - "stage3": [256, 96, 512, 2, True, True, 5, 3], - "stage4": [512, 192, 1024, 1, True, True, 5, 3], - } - - model = PPHGNetV2( - stem_channels=[3, 24, 32], stage_config=stage_config, use_lab=True, **kwargs - ) - return model - - -def PPHGNetV2_B2(pretrained=False, use_ssld=False, **kwargs): - """ - PPHGNetV2_B2 - Args: - pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise. - If str, means the path of the pretrained model. - use_ssld (bool) Whether using ssld pretrained model when pretrained is True. - Returns: - model: nn.Layer. Specific `PPHGNetV2_B2` model depends on args. - """ - stage_config = { - # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num - "stage1": [32, 32, 96, 1, False, False, 3, 4], - "stage2": [96, 64, 384, 1, True, False, 3, 4], - "stage3": [384, 128, 768, 3, True, True, 5, 4], - "stage4": [768, 256, 1536, 1, True, True, 5, 4], - } - - model = PPHGNetV2( - stem_channels=[3, 24, 32], stage_config=stage_config, use_lab=True, **kwargs - ) - return model - - -def PPHGNetV2_B3(pretrained=False, use_ssld=False, **kwargs): - """ - PPHGNetV2_B3 - Args: - pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise. - If str, means the path of the pretrained model. - use_ssld (bool) Whether using ssld pretrained model when pretrained is True. - Returns: - model: nn.Layer. Specific `PPHGNetV2_B3` model depends on args. - """ - stage_config = { - # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num - "stage1": [32, 32, 128, 1, False, False, 3, 5], - "stage2": [128, 64, 512, 1, True, False, 3, 5], - "stage3": [512, 128, 1024, 3, True, True, 5, 5], - "stage4": [1024, 256, 2048, 1, True, True, 5, 5], - } - - model = PPHGNetV2( - stem_channels=[3, 24, 32], stage_config=stage_config, use_lab=True, **kwargs - ) - return model - - -def PPHGNetV2_B4(pretrained=False, use_ssld=False, det=False, text_rec=False, **kwargs): - """ - PPHGNetV2_B4 - Args: - pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise. - If str, means the path of the pretrained model. - use_ssld (bool) Whether using ssld pretrained model when pretrained is True. - Returns: - model: nn.Layer. Specific `PPHGNetV2_B4` model depends on args. 
- """ - stage_config_rec = { - # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num, stride - "stage1": [48, 48, 128, 1, True, False, 3, 6, [2, 1]], - "stage2": [128, 96, 512, 1, True, False, 3, 6, [1, 2]], - "stage3": [512, 192, 1024, 3, True, True, 5, 6, [2, 1]], - "stage4": [1024, 384, 2048, 1, True, True, 5, 6, [2, 1]], - } - - stage_config_det = { - # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num - "stage1": [48, 48, 128, 1, False, False, 3, 6, 2], - "stage2": [128, 96, 512, 1, True, False, 3, 6, 2], - "stage3": [512, 192, 1024, 3, True, True, 5, 6, 2], - "stage4": [1024, 384, 2048, 1, True, True, 5, 6, 2], - } - model = PPHGNetV2( - stem_channels=[3, 32, 48], - stage_config=stage_config_det if det else stage_config_rec, - use_lab=False, - det=det, - text_rec=text_rec, - **kwargs, - ) - return model - - -def PPHGNetV2_B5(pretrained=False, use_ssld=False, **kwargs): - """ - PPHGNetV2_B5 - Args: - pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise. - If str, means the path of the pretrained model. - use_ssld (bool) Whether using ssld pretrained model when pretrained is True. - Returns: - model: nn.Layer. Specific `PPHGNetV2_B5` model depends on args. - """ - stage_config = { - # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num - "stage1": [64, 64, 128, 1, False, False, 3, 6], - "stage2": [128, 128, 512, 2, True, False, 3, 6], - "stage3": [512, 256, 1024, 5, True, True, 5, 6], - "stage4": [1024, 512, 2048, 2, True, True, 5, 6], - } - - model = PPHGNetV2( - stem_channels=[3, 32, 64], stage_config=stage_config, use_lab=False, **kwargs - ) - return model - - -def PPHGNetV2_B6(pretrained=False, use_ssld=False, **kwargs): - """ - PPHGNetV2_B6 - Args: - pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise. - If str, means the path of the pretrained model. - use_ssld (bool) Whether using ssld pretrained model when pretrained is True. - Returns: - model: nn.Layer. Specific `PPHGNetV2_B6` model depends on args. - """ - stage_config = { - # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num - "stage1": [96, 96, 192, 2, False, False, 3, 6], - "stage2": [192, 192, 512, 3, True, False, 3, 6], - "stage3": [512, 384, 1024, 6, True, True, 5, 6], - "stage4": [1024, 768, 2048, 3, True, True, 5, 6], - } - - model = PPHGNetV2( - stem_channels=[3, 48, 96], stage_config=stage_config, use_lab=False, **kwargs - ) - return model diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_svtrnet.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_svtrnet.py deleted file mode 100644 index 3a117736d9456723055a83e5e0195267d1be513a..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_svtrnet.py +++ /dev/null @@ -1,638 +0,0 @@ -import numpy as np -import torch -from torch import nn - -from ..common import Activation - - -def drop_path(x, drop_prob=0.0, training=False): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). - the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... - See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... 
- """ - if drop_prob == 0.0 or not training: - return x - keep_prob = torch.as_tensor(1 - drop_prob) - shape = (x.shape[0],) + (1,) * (x.ndim - 1) - random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype) - random_tensor = torch.floor(random_tensor) # binarize - output = x.divide(keep_prob) * random_tensor - return output - - -class ConvBNLayer(nn.Module): - def __init__( - self, - in_channels, - out_channels, - kernel_size=3, - stride=1, - padding=0, - bias_attr=False, - groups=1, - act="gelu", - ): - super().__init__() - self.conv = nn.Conv2d( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding, - groups=groups, - bias=bias_attr, - ) - self.norm = nn.BatchNorm2d(out_channels) - self.act = Activation(act_type=act, inplace=True) - - def forward(self, inputs): - out = self.conv(inputs) - out = self.norm(out) - out = self.act(out) - return out - - -class DropPath(nn.Module): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" - - def __init__(self, drop_prob=None): - super(DropPath, self).__init__() - self.drop_prob = drop_prob - - def forward(self, x): - return drop_path(x, self.drop_prob, self.training) - - -class Identity(nn.Module): - def __init__(self): - super(Identity, self).__init__() - - def forward(self, input): - return input - - -class Mlp(nn.Module): - def __init__( - self, - in_features, - hidden_features=None, - out_features=None, - act_layer="gelu", - drop=0.0, - ): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = nn.Linear(in_features, hidden_features) - self.act = Activation(act_type=act_layer, inplace=True) - self.fc2 = nn.Linear(hidden_features, out_features) - self.drop = nn.Dropout(drop) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.drop(x) - x = self.fc2(x) - x = self.drop(x) - return x - - -class ConvMixer(nn.Module): - def __init__( - self, - dim, - num_heads=8, - HW=[8, 25], - local_k=[3, 3], - ): - super().__init__() - self.HW = HW - self.dim = dim - self.local_mixer = nn.Conv2d( - dim, - dim, - local_k, - 1, - [local_k[0] // 2, local_k[1] // 2], - groups=num_heads, - ) - - def forward(self, x): - h = self.HW[0] - w = self.HW[1] - x = x.transpose([0, 2, 1]).reshape([0, self.dim, h, w]) - x = self.local_mixer(x) - x = x.flatten(2).permute(0, 2, 1) - return x - - -class Attention(nn.Module): - def __init__( - self, - dim, - num_heads=8, - mixer="Global", - HW=[8, 25], - local_k=[7, 11], - qkv_bias=False, - qk_scale=None, - attn_drop=0.0, - proj_drop=0.0, - ): - super().__init__() - self.num_heads = num_heads - head_dim = dim // num_heads - self.scale = qk_scale or head_dim**-0.5 - self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop) - self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop) - self.HW = HW - if HW is not None: - H = HW[0] - W = HW[1] - self.N = H * W - self.C = dim - if mixer == "Local" and HW is not None: - hk = local_k[0] - wk = local_k[1] - mask = torch.ones(H * W, H + hk - 1, W + wk - 1, dtype=torch.float32) - for h in range(0, H): - for w in range(0, W): - mask[h * W + w, h : h + hk, w : w + wk] = 0.0 - mask_paddle = mask[:, hk // 2 : H + hk // 2, wk // 2 : W + wk // 2].flatten( - 1 - ) - mask_inf = torch.full( - [H * W, H * W], fill_value=float("-Inf"), dtype=torch.float32 - ) - mask = torch.where(mask_paddle < 1, mask_paddle, mask_inf) - self.mask = 
mask.unsqueeze(0).unsqueeze(1) - # self.mask = mask[None, None, :] - self.mixer = mixer - - def forward(self, x): - if self.HW is not None: - N = self.N - C = self.C - else: - _, N, C = x.shape - qkv = self.qkv(x) - qkv = qkv.reshape((-1, N, 3, self.num_heads, C // self.num_heads)).permute( - 2, 0, 3, 1, 4 - ) - q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] - - attn = q.matmul(k.permute(0, 1, 3, 2)) - if self.mixer == "Local": - attn += self.mask - attn = nn.functional.softmax(attn, dim=-1) - attn = self.attn_drop(attn) - - x = (attn.matmul(v)).permute(0, 2, 1, 3).reshape((-1, N, C)) - x = self.proj(x) - x = self.proj_drop(x) - return x - - -class Block(nn.Module): - def __init__( - self, - dim, - num_heads, - mixer="Global", - local_mixer=[7, 11], - HW=None, - mlp_ratio=4.0, - qkv_bias=False, - qk_scale=None, - drop=0.0, - attn_drop=0.0, - drop_path=0.0, - act_layer="gelu", - norm_layer="nn.LayerNorm", - epsilon=1e-6, - prenorm=True, - ): - super().__init__() - if isinstance(norm_layer, str): - self.norm1 = eval(norm_layer)(dim, eps=epsilon) - else: - self.norm1 = norm_layer(dim) - if mixer == "Global" or mixer == "Local": - self.mixer = Attention( - dim, - num_heads=num_heads, - mixer=mixer, - HW=HW, - local_k=local_mixer, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - attn_drop=attn_drop, - proj_drop=drop, - ) - elif mixer == "Conv": - self.mixer = ConvMixer(dim, num_heads=num_heads, HW=HW, local_k=local_mixer) - else: - raise TypeError("The mixer must be one of [Global, Local, Conv]") - - self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity() - if isinstance(norm_layer, str): - self.norm2 = eval(norm_layer)(dim, eps=epsilon) - else: - self.norm2 = norm_layer(dim) - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp_ratio = mlp_ratio - self.mlp = Mlp( - in_features=dim, - hidden_features=mlp_hidden_dim, - act_layer=act_layer, - drop=drop, - ) - self.prenorm = prenorm - - def forward(self, x): - if self.prenorm: - x = self.norm1(x + self.drop_path(self.mixer(x))) - x = self.norm2(x + self.drop_path(self.mlp(x))) - else: - x = x + self.drop_path(self.mixer(self.norm1(x))) - x = x + self.drop_path(self.mlp(self.norm2(x))) - return x - - -class PatchEmbed(nn.Module): - """Image to Patch Embedding""" - - def __init__( - self, - img_size=[32, 100], - in_channels=3, - embed_dim=768, - sub_num=2, - patch_size=[4, 4], - mode="pope", - ): - super().__init__() - num_patches = (img_size[1] // (2**sub_num)) * (img_size[0] // (2**sub_num)) - self.img_size = img_size - self.num_patches = num_patches - self.embed_dim = embed_dim - self.norm = None - if mode == "pope": - if sub_num == 2: - self.proj = nn.Sequential( - ConvBNLayer( - in_channels=in_channels, - out_channels=embed_dim // 2, - kernel_size=3, - stride=2, - padding=1, - act="gelu", - bias_attr=True, - ), - ConvBNLayer( - in_channels=embed_dim // 2, - out_channels=embed_dim, - kernel_size=3, - stride=2, - padding=1, - act="gelu", - bias_attr=True, - ), - ) - if sub_num == 3: - self.proj = nn.Sequential( - ConvBNLayer( - in_channels=in_channels, - out_channels=embed_dim // 4, - kernel_size=3, - stride=2, - padding=1, - act="gelu", - bias_attr=True, - ), - ConvBNLayer( - in_channels=embed_dim // 4, - out_channels=embed_dim // 2, - kernel_size=3, - stride=2, - padding=1, - act="gelu", - bias_attr=True, - ), - ConvBNLayer( - in_channels=embed_dim // 2, - out_channels=embed_dim, - kernel_size=3, - stride=2, - padding=1, - act="gelu", - bias_attr=True, - ), - ) - elif mode == "linear": - self.proj = nn.Conv2d( - 1, embed_dim, 
kernel_size=patch_size, stride=patch_size - ) - self.num_patches = ( - img_size[0] // patch_size[0] * img_size[1] // patch_size[1] - ) - - def forward(self, x): - B, C, H, W = x.shape - assert ( - H == self.img_size[0] and W == self.img_size[1] - ), "Input image size ({}*{}) doesn't match model ({}*{}).".format( - H, W, self.img_size[0], self.img_size[1] - ) - x = self.proj(x).flatten(2).permute(0, 2, 1) - return x - - -class SubSample(nn.Module): - def __init__( - self, - in_channels, - out_channels, - types="Pool", - stride=[2, 1], - sub_norm="nn.LayerNorm", - act=None, - ): - super().__init__() - self.types = types - if types == "Pool": - self.avgpool = nn.AvgPool2d( - kernel_size=[3, 5], stride=stride, padding=[1, 2] - ) - self.maxpool = nn.MaxPool2d( - kernel_size=[3, 5], stride=stride, padding=[1, 2] - ) - self.proj = nn.Linear(in_channels, out_channels) - else: - self.conv = nn.Conv2d( - in_channels, - out_channels, - kernel_size=3, - stride=stride, - padding=1, - ) - self.norm = eval(sub_norm)(out_channels) - if act is not None: - self.act = act() - else: - self.act = None - - def forward(self, x): - if self.types == "Pool": - x1 = self.avgpool(x) - x2 = self.maxpool(x) - x = (x1 + x2) * 0.5 - out = self.proj(x.flatten(2).permute(0, 2, 1)) - else: - x = self.conv(x) - out = x.flatten(2).permute(0, 2, 1) - out = self.norm(out) - if self.act is not None: - out = self.act(out) - - return out - - -class SVTRNet(nn.Module): - def __init__( - self, - img_size=[32, 100], - in_channels=3, - embed_dim=[64, 128, 256], - depth=[3, 6, 3], - num_heads=[2, 4, 8], - mixer=["Local"] * 6 + ["Global"] * 6, # Local atten, Global atten, Conv - local_mixer=[[7, 11], [7, 11], [7, 11]], - patch_merging="Conv", # Conv, Pool, None - mlp_ratio=4, - qkv_bias=True, - qk_scale=None, - drop_rate=0.0, - last_drop=0.0, - attn_drop_rate=0.0, - drop_path_rate=0.1, - norm_layer="nn.LayerNorm", - sub_norm="nn.LayerNorm", - epsilon=1e-6, - out_channels=192, - out_char_num=25, - block_unit="Block", - act="gelu", - last_stage=True, - sub_num=2, - prenorm=True, - use_lenhead=False, - **kwargs - ): - super().__init__() - self.img_size = img_size - self.embed_dim = embed_dim - self.out_channels = out_channels - self.prenorm = prenorm - patch_merging = ( - None - if patch_merging != "Conv" and patch_merging != "Pool" - else patch_merging - ) - self.patch_embed = PatchEmbed( - img_size=img_size, - in_channels=in_channels, - embed_dim=embed_dim[0], - sub_num=sub_num, - ) - num_patches = self.patch_embed.num_patches - self.HW = [img_size[0] // (2**sub_num), img_size[1] // (2**sub_num)] - self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim[0])) - self.pos_drop = nn.Dropout(p=drop_rate) - Block_unit = eval(block_unit) - - dpr = np.linspace(0, drop_path_rate, sum(depth)) - self.blocks1 = nn.ModuleList( - [ - Block_unit( - dim=embed_dim[0], - num_heads=num_heads[0], - mixer=mixer[0 : depth[0]][i], - HW=self.HW, - local_mixer=local_mixer[0], - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - act_layer=act, - attn_drop=attn_drop_rate, - drop_path=dpr[0 : depth[0]][i], - norm_layer=norm_layer, - epsilon=epsilon, - prenorm=prenorm, - ) - for i in range(depth[0]) - ] - ) - if patch_merging is not None: - self.sub_sample1 = SubSample( - embed_dim[0], - embed_dim[1], - sub_norm=sub_norm, - stride=[2, 1], - types=patch_merging, - ) - HW = [self.HW[0] // 2, self.HW[1]] - else: - HW = self.HW - self.patch_merging = patch_merging - self.blocks2 = nn.ModuleList( - [ - Block_unit( - 
dim=embed_dim[1], - num_heads=num_heads[1], - mixer=mixer[depth[0] : depth[0] + depth[1]][i], - HW=HW, - local_mixer=local_mixer[1], - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - act_layer=act, - attn_drop=attn_drop_rate, - drop_path=dpr[depth[0] : depth[0] + depth[1]][i], - norm_layer=norm_layer, - epsilon=epsilon, - prenorm=prenorm, - ) - for i in range(depth[1]) - ] - ) - if patch_merging is not None: - self.sub_sample2 = SubSample( - embed_dim[1], - embed_dim[2], - sub_norm=sub_norm, - stride=[2, 1], - types=patch_merging, - ) - HW = [self.HW[0] // 4, self.HW[1]] - else: - HW = self.HW - self.blocks3 = nn.ModuleList( - [ - Block_unit( - dim=embed_dim[2], - num_heads=num_heads[2], - mixer=mixer[depth[0] + depth[1] :][i], - HW=HW, - local_mixer=local_mixer[2], - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - act_layer=act, - attn_drop=attn_drop_rate, - drop_path=dpr[depth[0] + depth[1] :][i], - norm_layer=norm_layer, - epsilon=epsilon, - prenorm=prenorm, - ) - for i in range(depth[2]) - ] - ) - self.last_stage = last_stage - if last_stage: - self.avg_pool = nn.AdaptiveAvgPool2d([1, out_char_num]) - self.last_conv = nn.Conv2d( - in_channels=embed_dim[2], - out_channels=self.out_channels, - kernel_size=1, - stride=1, - padding=0, - bias=False, - ) - self.hardswish = Activation("hard_swish", inplace=True) # nn.Hardswish() - # self.dropout = nn.Dropout(p=last_drop, mode="downscale_in_infer") - self.dropout = nn.Dropout(p=last_drop) - if not prenorm: - self.norm = eval(norm_layer)(embed_dim[-1], eps=epsilon) - self.use_lenhead = use_lenhead - if use_lenhead: - self.len_conv = nn.Linear(embed_dim[2], self.out_channels) - self.hardswish_len = Activation( - "hard_swish", inplace=True - ) # nn.Hardswish() - self.dropout_len = nn.Dropout(p=last_drop) - - torch.nn.init.xavier_normal_(self.pos_embed) - self.apply(self._init_weights) - - def _init_weights(self, m): - # weight initialization - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode="fan_out") - if m.bias is not None: - nn.init.zeros_(m.bias) - elif isinstance(m, nn.BatchNorm2d): - nn.init.ones_(m.weight) - nn.init.zeros_(m.bias) - elif isinstance(m, nn.Linear): - nn.init.normal_(m.weight, 0, 0.01) - if m.bias is not None: - nn.init.zeros_(m.bias) - elif isinstance(m, nn.ConvTranspose2d): - nn.init.kaiming_normal_(m.weight, mode="fan_out") - if m.bias is not None: - nn.init.zeros_(m.bias) - elif isinstance(m, nn.LayerNorm): - nn.init.ones_(m.weight) - nn.init.zeros_(m.bias) - - def forward_features(self, x): - x = self.patch_embed(x) - x = x + self.pos_embed - x = self.pos_drop(x) - for blk in self.blocks1: - x = blk(x) - if self.patch_merging is not None: - x = self.sub_sample1( - x.permute(0, 2, 1).reshape( - [-1, self.embed_dim[0], self.HW[0], self.HW[1]] - ) - ) - for blk in self.blocks2: - x = blk(x) - if self.patch_merging is not None: - x = self.sub_sample2( - x.permute(0, 2, 1).reshape( - [-1, self.embed_dim[1], self.HW[0] // 2, self.HW[1]] - ) - ) - for blk in self.blocks3: - x = blk(x) - if not self.prenorm: - x = self.norm(x) - return x - - def forward(self, x): - x = self.forward_features(x) - if self.use_lenhead: - len_x = self.len_conv(x.mean(1)) - len_x = self.dropout_len(self.hardswish_len(len_x)) - if self.last_stage: - if self.patch_merging is not None: - h = self.HW[0] // 4 - else: - h = self.HW[0] - x = self.avg_pool( - x.permute(0, 2, 1).reshape([-1, self.embed_dim[2], h, self.HW[1]]) - ) - x = self.last_conv(x) - x = 
self.hardswish(x) - x = self.dropout(x) - if self.use_lenhead: - return x, len_x - return x diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/common.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/common.py deleted file mode 100644 index ec1b30ccb0a04888562a0207bbdfbed1d8da0add..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/common.py +++ /dev/null @@ -1,76 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - - -class Hswish(nn.Module): - def __init__(self, inplace=True): - super(Hswish, self).__init__() - self.inplace = inplace - - def forward(self, x): - return x * F.relu6(x + 3.0, inplace=self.inplace) / 6.0 - - -# out = max(0, min(1, slop*x+offset)) -# paddle.fluid.layers.hard_sigmoid(x, slope=0.2, offset=0.5, name=None) -class Hsigmoid(nn.Module): - def __init__(self, inplace=True): - super(Hsigmoid, self).__init__() - self.inplace = inplace - - def forward(self, x): - # torch: F.relu6(x + 3., inplace=self.inplace) / 6. - # paddle: F.relu6(1.2 * x + 3., inplace=self.inplace) / 6. - return F.relu6(1.2 * x + 3.0, inplace=self.inplace) / 6.0 - - -class GELU(nn.Module): - def __init__(self, inplace=True): - super(GELU, self).__init__() - self.inplace = inplace - - def forward(self, x): - return torch.nn.functional.gelu(x) - - -class Swish(nn.Module): - def __init__(self, inplace=True): - super(Swish, self).__init__() - self.inplace = inplace - - def forward(self, x): - if self.inplace: - x.mul_(torch.sigmoid(x)) - return x - else: - return x * torch.sigmoid(x) - - -class Activation(nn.Module): - def __init__(self, act_type, inplace=True): - super(Activation, self).__init__() - act_type = act_type.lower() - if act_type == "relu": - self.act = nn.ReLU(inplace=inplace) - elif act_type == "relu6": - self.act = nn.ReLU6(inplace=inplace) - elif act_type == "sigmoid": - raise NotImplementedError - elif act_type == "hard_sigmoid": - self.act = Hsigmoid( - inplace - ) # nn.Hardsigmoid(inplace=inplace)#Hsigmoid(inplace)# - elif act_type == "hard_swish" or act_type == "hswish": - self.act = Hswish(inplace=inplace) - elif act_type == "leakyrelu": - self.act = nn.LeakyReLU(inplace=inplace) - elif act_type == "gelu": - self.act = GELU(inplace=inplace) - elif act_type == "swish": - self.act = Swish(inplace=inplace) - else: - raise NotImplementedError - - def forward(self, inputs): - return self.act(inputs) diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/__init__.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/__init__.py deleted file mode 100644 index 00428c4374f8d69f8b59b40406bbb56cdf904dd3..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/__init__.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
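# Usage sketch for the build_head factory below (illustrative; the channel numbers
# are assumptions). The config dict names the head class and carries its keyword
# arguments, while backbone/neck-dependent values are passed through **kwargs.
#
#   det_head = build_head({"name": "DBHead", "k": 50}, in_channels=256)
#   rec_head = build_head({"name": "CTCHead", "out_channels": 6625}, in_channels=192)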
- -__all__ = ["build_head"] - - -def build_head(config, **kwargs): - # det head - from .det_db_head import DBHead, PFHeadLocal - - # rec head - from .rec_ctc_head import CTCHead - from .rec_multi_head import MultiHead - - # cls head - from .cls_head import ClsHead - - support_dict = [ - "DBHead", - "CTCHead", - "ClsHead", - "MultiHead", - "PFHeadLocal", - ] - - module_name = config.pop("name") - char_num = config.pop("char_num", 6625) - assert module_name in support_dict, Exception( - "head only support {}".format(support_dict) - ) - module_class = eval(module_name)(**config, **kwargs) - return module_class diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/cls_head.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/cls_head.py deleted file mode 100644 index 9353b9ebb88c043ab31eedb4219b191eb88417da..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/cls_head.py +++ /dev/null @@ -1,23 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - - -class ClsHead(nn.Module): - """ - Class orientation - Args: - params(dict): super parameters for build Class network - """ - - def __init__(self, in_channels, class_dim, **kwargs): - super(ClsHead, self).__init__() - self.pool = nn.AdaptiveAvgPool2d(1) - self.fc = nn.Linear(in_channels, class_dim, bias=True) - - def forward(self, x): - x = self.pool(x) - x = torch.reshape(x, shape=[x.shape[0], x.shape[1]]) - x = self.fc(x) - x = F.softmax(x, dim=1) - return x diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_db_head.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_db_head.py deleted file mode 100644 index 7c1196830829e6c788e5864861471977cdb47e25..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_db_head.py +++ /dev/null @@ -1,109 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -from ..common import Activation -from ..backbones.det_mobilenet_v3 import ConvBNLayer - -class Head(nn.Module): - def __init__(self, in_channels, **kwargs): - super(Head, self).__init__() - self.conv1 = nn.Conv2d( - in_channels=in_channels, - out_channels=in_channels // 4, - kernel_size=3, - padding=1, - bias=False) - self.conv_bn1 = nn.BatchNorm2d( - in_channels // 4) - self.relu1 = Activation(act_type='relu') - - self.conv2 = nn.ConvTranspose2d( - in_channels=in_channels // 4, - out_channels=in_channels // 4, - kernel_size=2, - stride=2) - self.conv_bn2 = nn.BatchNorm2d( - in_channels // 4) - self.relu2 = Activation(act_type='relu') - - self.conv3 = nn.ConvTranspose2d( - in_channels=in_channels // 4, - out_channels=1, - kernel_size=2, - stride=2) - - def forward(self, x, return_f=False): - x = self.conv1(x) - x = self.conv_bn1(x) - x = self.relu1(x) - x = self.conv2(x) - x = self.conv_bn2(x) - x = self.relu2(x) - if return_f is True: - f = x - x = self.conv3(x) - x = torch.sigmoid(x) - if return_f is True: - return x, f - return x - - -class DBHead(nn.Module): - """ - Differentiable Binarization (DB) for text detection: - see https://arxiv.org/abs/1911.08947 - args: - params(dict): super parameters for build DB network - """ - - def __init__(self, in_channels, k=50, **kwargs): - super(DBHead, self).__init__() - self.k = k - binarize_name_list = [ - 'conv2d_56', 'batch_norm_47', 'conv2d_transpose_0', 'batch_norm_48', - 
'conv2d_transpose_1', 'binarize' - ] - thresh_name_list = [ - 'conv2d_57', 'batch_norm_49', 'conv2d_transpose_2', 'batch_norm_50', - 'conv2d_transpose_3', 'thresh' - ] - self.binarize = Head(in_channels, **kwargs)# binarize_name_list) - self.thresh = Head(in_channels, **kwargs)#thresh_name_list) - - def step_function(self, x, y): - return torch.reciprocal(1 + torch.exp(-self.k * (x - y))) - - def forward(self, x): - shrink_maps = self.binarize(x) - return {'maps': shrink_maps} - - -class LocalModule(nn.Module): - def __init__(self, in_c, mid_c, use_distance=True): - super(self.__class__, self).__init__() - self.last_3 = ConvBNLayer(in_c + 1, mid_c, 3, 1, 1, act='relu') - self.last_1 = nn.Conv2d(mid_c, 1, 1, 1, 0) - - def forward(self, x, init_map, distance_map): - outf = torch.cat([init_map, x], dim=1) - # last Conv - out = self.last_1(self.last_3(outf)) - return out - -class PFHeadLocal(DBHead): - def __init__(self, in_channels, k=50, mode='small', **kwargs): - super(PFHeadLocal, self).__init__(in_channels, k, **kwargs) - self.mode = mode - - self.up_conv = nn.Upsample(scale_factor=2, mode="nearest") - if self.mode == 'large': - self.cbn_layer = LocalModule(in_channels // 4, in_channels // 4) - elif self.mode == 'small': - self.cbn_layer = LocalModule(in_channels // 4, in_channels // 8) - - def forward(self, x, targets=None): - shrink_maps, f = self.binarize(x, return_f=True) - base_maps = shrink_maps - cbn_maps = self.cbn_layer(self.up_conv(f), shrink_maps, None) - cbn_maps = F.sigmoid(cbn_maps) - return {'maps': 0.5 * (base_maps + cbn_maps), 'cbn_maps': cbn_maps} \ No newline at end of file diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/rec_ctc_head.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/rec_ctc_head.py deleted file mode 100644 index 42e2fabba48ce813b7736b2242eb117761a242bc..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/rec_ctc_head.py +++ /dev/null @@ -1,54 +0,0 @@ -import torch.nn.functional as F -from torch import nn - - -class CTCHead(nn.Module): - def __init__( - self, - in_channels, - out_channels=6625, - fc_decay=0.0004, - mid_channels=None, - return_feats=False, - **kwargs - ): - super(CTCHead, self).__init__() - if mid_channels is None: - self.fc = nn.Linear( - in_channels, - out_channels, - bias=True, - ) - else: - self.fc1 = nn.Linear( - in_channels, - mid_channels, - bias=True, - ) - self.fc2 = nn.Linear( - mid_channels, - out_channels, - bias=True, - ) - - self.out_channels = out_channels - self.mid_channels = mid_channels - self.return_feats = return_feats - - def forward(self, x, labels=None): - if self.mid_channels is None: - predicts = self.fc(x) - else: - x = self.fc1(x) - predicts = self.fc2(x) - - if self.return_feats: - result = (x, predicts) - else: - result = predicts - - if not self.training: - predicts = F.softmax(predicts, dim=2) - result = predicts - - return result diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/rec_multi_head.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/rec_multi_head.py deleted file mode 100644 index a4807cbb0cde37024fac62a39c8dee7f75d6da1f..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/rec_multi_head.py +++ /dev/null @@ -1,58 +0,0 @@ -from torch import nn - -from ..necks.rnn import Im2Seq, SequenceEncoder -from .rec_ctc_head 
import CTCHead - - -class FCTranspose(nn.Module): - def __init__(self, in_channels, out_channels, only_transpose=False): - super().__init__() - self.only_transpose = only_transpose - if not self.only_transpose: - self.fc = nn.Linear(in_channels, out_channels, bias=False) - - def forward(self, x): - if self.only_transpose: - return x.permute([0, 2, 1]) - else: - return self.fc(x.permute([0, 2, 1])) - - -class MultiHead(nn.Module): - def __init__(self, in_channels, out_channels_list, **kwargs): - super().__init__() - self.head_list = kwargs.pop("head_list") - - self.gtc_head = "sar" - assert len(self.head_list) >= 2 - for idx, head_name in enumerate(self.head_list): - name = list(head_name)[0] - if name == "SARHead": - pass - - elif name == "NRTRHead": - pass - elif name == "CTCHead": - # ctc neck - self.encoder_reshape = Im2Seq(in_channels) - neck_args = self.head_list[idx][name]["Neck"] - encoder_type = neck_args.pop("name") - self.ctc_encoder = SequenceEncoder( - in_channels=in_channels, encoder_type=encoder_type, **neck_args - ) - # ctc head - head_args = self.head_list[idx][name].get("Head", {}) - if head_args is None: - head_args = {} - - self.ctc_head = CTCHead( - in_channels=self.ctc_encoder.out_channels, - out_channels=out_channels_list["CTCLabelDecode"], - **head_args, - ) - else: - raise NotImplementedError(f"{name} is not supported in MultiHead yet") - - def forward(self, x, data=None): - ctc_encoder = self.ctc_encoder(x) - return self.ctc_head(ctc_encoder) diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/__init__.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/__init__.py deleted file mode 100644 index bbe85bc6a59f8d03541cbeb0e7cff34c5ba6c2e5..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/__init__.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
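# Usage sketch for the build_neck factory below (illustrative; the channel numbers
# are assumptions). The config dict names the neck class and carries its keyword arguments.
#
#   neck = build_neck({"name": "DBFPN", "in_channels": [16, 24, 56, 480], "out_channels": 96})
#   # yields a DBFPN that fuses the four backbone feature maps into a single 96-channel map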
- -__all__ = ["build_neck"] - - -def build_neck(config): - from .db_fpn import DBFPN, LKPAN, RSEFPN - from .rnn import SequenceEncoder - - support_dict = ["DBFPN", "SequenceEncoder", "RSEFPN", "LKPAN"] - - module_name = config.pop("name") - assert module_name in support_dict, Exception( - "neck only support {}".format(support_dict) - ) - module_class = eval(module_name)(**config) - return module_class diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/db_fpn.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/db_fpn.py deleted file mode 100644 index 9c8460a23a5816ba9ff8c6be6ed8fd31e4e697b2..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/db_fpn.py +++ /dev/null @@ -1,456 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from ..backbones.det_mobilenet_v3 import SEModule -from ..necks.intracl import IntraCLBlock - - -def hard_swish(x, inplace=True): - return x * F.relu6(x + 3.0, inplace=inplace) / 6.0 - - -class DSConv(nn.Module): - def __init__( - self, - in_channels, - out_channels, - kernel_size, - padding, - stride=1, - groups=None, - if_act=True, - act="relu", - **kwargs - ): - super(DSConv, self).__init__() - if groups == None: - groups = in_channels - self.if_act = if_act - self.act = act - self.conv1 = nn.Conv2d( - in_channels=in_channels, - out_channels=in_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding, - groups=groups, - bias=False, - ) - - self.bn1 = nn.BatchNorm2d(in_channels) - - self.conv2 = nn.Conv2d( - in_channels=in_channels, - out_channels=int(in_channels * 4), - kernel_size=1, - stride=1, - bias=False, - ) - - self.bn2 = nn.BatchNorm2d(int(in_channels * 4)) - - self.conv3 = nn.Conv2d( - in_channels=int(in_channels * 4), - out_channels=out_channels, - kernel_size=1, - stride=1, - bias=False, - ) - self._c = [in_channels, out_channels] - if in_channels != out_channels: - self.conv_end = nn.Conv2d( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=1, - stride=1, - bias=False, - ) - - def forward(self, inputs): - x = self.conv1(inputs) - x = self.bn1(x) - - x = self.conv2(x) - x = self.bn2(x) - if self.if_act: - if self.act == "relu": - x = F.relu(x) - elif self.act == "hardswish": - x = hard_swish(x) - else: - print( - "The activation function({}) is selected incorrectly.".format( - self.act - ) - ) - exit() - - x = self.conv3(x) - if self._c[0] != self._c[1]: - x = x + self.conv_end(inputs) - return x - - -class DBFPN(nn.Module): - def __init__(self, in_channels, out_channels, use_asf=False, **kwargs): - super(DBFPN, self).__init__() - self.out_channels = out_channels - self.use_asf = use_asf - - self.in2_conv = nn.Conv2d( - in_channels=in_channels[0], - out_channels=self.out_channels, - kernel_size=1, - bias=False, - ) - self.in3_conv = nn.Conv2d( - in_channels=in_channels[1], - out_channels=self.out_channels, - kernel_size=1, - bias=False, - ) - self.in4_conv = nn.Conv2d( - in_channels=in_channels[2], - out_channels=self.out_channels, - kernel_size=1, - bias=False, - ) - self.in5_conv = nn.Conv2d( - in_channels=in_channels[3], - out_channels=self.out_channels, - kernel_size=1, - bias=False, - ) - self.p5_conv = nn.Conv2d( - in_channels=self.out_channels, - out_channels=self.out_channels // 4, - kernel_size=3, - padding=1, - bias=False, - ) - self.p4_conv = nn.Conv2d( - in_channels=self.out_channels, - out_channels=self.out_channels // 4, - kernel_size=3, - 
padding=1, - bias=False, - ) - self.p3_conv = nn.Conv2d( - in_channels=self.out_channels, - out_channels=self.out_channels // 4, - kernel_size=3, - padding=1, - bias=False, - ) - self.p2_conv = nn.Conv2d( - in_channels=self.out_channels, - out_channels=self.out_channels // 4, - kernel_size=3, - padding=1, - bias=False, - ) - - if self.use_asf is True: - self.asf = ASFBlock(self.out_channels, self.out_channels // 4) - - def forward(self, x): - c2, c3, c4, c5 = x - - in5 = self.in5_conv(c5) - in4 = self.in4_conv(c4) - in3 = self.in3_conv(c3) - in2 = self.in2_conv(c2) - - out4 = in4 + F.interpolate( - in5, - scale_factor=2, - mode="nearest", - ) # align_mode=1) # 1/16 - out3 = in3 + F.interpolate( - out4, - scale_factor=2, - mode="nearest", - ) # align_mode=1) # 1/8 - out2 = in2 + F.interpolate( - out3, - scale_factor=2, - mode="nearest", - ) # align_mode=1) # 1/4 - - p5 = self.p5_conv(in5) - p4 = self.p4_conv(out4) - p3 = self.p3_conv(out3) - p2 = self.p2_conv(out2) - p5 = F.interpolate( - p5, - scale_factor=8, - mode="nearest", - ) # align_mode=1) - p4 = F.interpolate( - p4, - scale_factor=4, - mode="nearest", - ) # align_mode=1) - p3 = F.interpolate( - p3, - scale_factor=2, - mode="nearest", - ) # align_mode=1) - - fuse = torch.cat([p5, p4, p3, p2], dim=1) - - if self.use_asf is True: - fuse = self.asf(fuse, [p5, p4, p3, p2]) - - return fuse - - -class RSELayer(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, shortcut=True): - super(RSELayer, self).__init__() - self.out_channels = out_channels - self.in_conv = nn.Conv2d( - in_channels=in_channels, - out_channels=self.out_channels, - kernel_size=kernel_size, - padding=int(kernel_size // 2), - bias=False, - ) - self.se_block = SEModule(self.out_channels) - self.shortcut = shortcut - - def forward(self, ins): - x = self.in_conv(ins) - if self.shortcut: - out = x + self.se_block(x) - else: - out = self.se_block(x) - return out - - -class RSEFPN(nn.Module): - def __init__(self, in_channels, out_channels, shortcut=True, **kwargs): - super(RSEFPN, self).__init__() - self.out_channels = out_channels - self.ins_conv = nn.ModuleList() - self.inp_conv = nn.ModuleList() - self.intracl = False - if "intracl" in kwargs.keys() and kwargs["intracl"] is True: - self.intracl = kwargs["intracl"] - self.incl1 = IntraCLBlock(self.out_channels // 4, reduce_factor=2) - self.incl2 = IntraCLBlock(self.out_channels // 4, reduce_factor=2) - self.incl3 = IntraCLBlock(self.out_channels // 4, reduce_factor=2) - self.incl4 = IntraCLBlock(self.out_channels // 4, reduce_factor=2) - - for i in range(len(in_channels)): - self.ins_conv.append( - RSELayer(in_channels[i], out_channels, kernel_size=1, shortcut=shortcut) - ) - self.inp_conv.append( - RSELayer( - out_channels, out_channels // 4, kernel_size=3, shortcut=shortcut - ) - ) - - def forward(self, x): - c2, c3, c4, c5 = x - - in5 = self.ins_conv[3](c5) - in4 = self.ins_conv[2](c4) - in3 = self.ins_conv[1](c3) - in2 = self.ins_conv[0](c2) - - out4 = in4 + F.interpolate(in5, scale_factor=2, mode="nearest") # 1/16 - out3 = in3 + F.interpolate(out4, scale_factor=2, mode="nearest") # 1/8 - out2 = in2 + F.interpolate(out3, scale_factor=2, mode="nearest") # 1/4 - - p5 = self.inp_conv[3](in5) - p4 = self.inp_conv[2](out4) - p3 = self.inp_conv[1](out3) - p2 = self.inp_conv[0](out2) - - if self.intracl is True: - p5 = self.incl4(p5) - p4 = self.incl3(p4) - p3 = self.incl2(p3) - p2 = self.incl1(p2) - - p5 = F.interpolate(p5, scale_factor=8, mode="nearest") - p4 = F.interpolate(p4, scale_factor=4, 
mode="nearest") - p3 = F.interpolate(p3, scale_factor=2, mode="nearest") - - fuse = torch.cat([p5, p4, p3, p2], dim=1) - return fuse - - -class LKPAN(nn.Module): - def __init__(self, in_channels, out_channels, mode="large", **kwargs): - super(LKPAN, self).__init__() - self.out_channels = out_channels - - self.ins_conv = nn.ModuleList() - self.inp_conv = nn.ModuleList() - # pan head - self.pan_head_conv = nn.ModuleList() - self.pan_lat_conv = nn.ModuleList() - - if mode.lower() == "lite": - p_layer = DSConv - elif mode.lower() == "large": - p_layer = nn.Conv2d - else: - raise ValueError( - "mode can only be one of ['lite', 'large'], but received {}".format( - mode - ) - ) - - for i in range(len(in_channels)): - self.ins_conv.append( - nn.Conv2d( - in_channels=in_channels[i], - out_channels=self.out_channels, - kernel_size=1, - bias=False, - ) - ) - - self.inp_conv.append( - p_layer( - in_channels=self.out_channels, - out_channels=self.out_channels // 4, - kernel_size=9, - padding=4, - bias=False, - ) - ) - - if i > 0: - self.pan_head_conv.append( - nn.Conv2d( - in_channels=self.out_channels // 4, - out_channels=self.out_channels // 4, - kernel_size=3, - padding=1, - stride=2, - bias=False, - ) - ) - self.pan_lat_conv.append( - p_layer( - in_channels=self.out_channels // 4, - out_channels=self.out_channels // 4, - kernel_size=9, - padding=4, - bias=False, - ) - ) - self.intracl = False - if "intracl" in kwargs.keys() and kwargs["intracl"] is True: - self.intracl = kwargs["intracl"] - self.incl1 = IntraCLBlock(self.out_channels // 4, reduce_factor=2) - self.incl2 = IntraCLBlock(self.out_channels // 4, reduce_factor=2) - self.incl3 = IntraCLBlock(self.out_channels // 4, reduce_factor=2) - self.incl4 = IntraCLBlock(self.out_channels // 4, reduce_factor=2) - - def forward(self, x): - c2, c3, c4, c5 = x - - in5 = self.ins_conv[3](c5) - in4 = self.ins_conv[2](c4) - in3 = self.ins_conv[1](c3) - in2 = self.ins_conv[0](c2) - - out4 = in4 + F.interpolate(in5, scale_factor=2, mode="nearest") # 1/16 - out3 = in3 + F.interpolate(out4, scale_factor=2, mode="nearest") # 1/8 - out2 = in2 + F.interpolate(out3, scale_factor=2, mode="nearest") # 1/4 - - f5 = self.inp_conv[3](in5) - f4 = self.inp_conv[2](out4) - f3 = self.inp_conv[1](out3) - f2 = self.inp_conv[0](out2) - - pan3 = f3 + self.pan_head_conv[0](f2) - pan4 = f4 + self.pan_head_conv[1](pan3) - pan5 = f5 + self.pan_head_conv[2](pan4) - - p2 = self.pan_lat_conv[0](f2) - p3 = self.pan_lat_conv[1](pan3) - p4 = self.pan_lat_conv[2](pan4) - p5 = self.pan_lat_conv[3](pan5) - - if self.intracl is True: - p5 = self.incl4(p5) - p4 = self.incl3(p4) - p3 = self.incl2(p3) - p2 = self.incl1(p2) - - p5 = F.interpolate(p5, scale_factor=8, mode="nearest") - p4 = F.interpolate(p4, scale_factor=4, mode="nearest") - p3 = F.interpolate(p3, scale_factor=2, mode="nearest") - - fuse = torch.cat([p5, p4, p3, p2], dim=1) - return fuse - - -class ASFBlock(nn.Module): - """ - This code is refered from: - https://github.com/MhLiao/DB/blob/master/decoders/feature_attention.py - """ - - def __init__(self, in_channels, inter_channels, out_features_num=4): - """ - Adaptive Scale Fusion (ASF) block of DBNet++ - Args: - in_channels: the number of channels in the input data - inter_channels: the number of middle channels - out_features_num: the number of fused stages - """ - super(ASFBlock, self).__init__() - self.in_channels = in_channels - self.inter_channels = inter_channels - self.out_features_num = out_features_num - self.conv = nn.Conv2d(in_channels, inter_channels, 3, padding=1) 
- - self.spatial_scale = nn.Sequential( - # Nx1xHxW - nn.Conv2d( - in_channels=1, - out_channels=1, - kernel_size=3, - bias=False, - padding=1, - ), - nn.ReLU(), - nn.Conv2d( - in_channels=1, - out_channels=1, - kernel_size=1, - bias=False, - ), - nn.Sigmoid(), - ) - - self.channel_scale = nn.Sequential( - nn.Conv2d( - in_channels=inter_channels, - out_channels=out_features_num, - kernel_size=1, - bias=False, - ), - nn.Sigmoid(), - ) - - def forward(self, fuse_features, features_list): - fuse_features = self.conv(fuse_features) - spatial_x = torch.mean(fuse_features, dim=1, keepdim=True) - attention_scores = self.spatial_scale(spatial_x) + fuse_features - attention_scores = self.channel_scale(attention_scores) - assert len(features_list) == self.out_features_num - - out_list = [] - for i in range(self.out_features_num): - out_list.append(attention_scores[:, i : i + 1] * features_list[i]) - return torch.cat(out_list, dim=1) diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/intracl.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/intracl.py deleted file mode 100644 index 0ba85fa8086ff013491ef66beca49e0ee8475f2c..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/intracl.py +++ /dev/null @@ -1,117 +0,0 @@ -from torch import nn - - -class IntraCLBlock(nn.Module): - def __init__(self, in_channels=96, reduce_factor=4): - super(IntraCLBlock, self).__init__() - self.channels = in_channels - self.rf = reduce_factor - self.conv1x1_reduce_channel = nn.Conv2d( - self.channels, self.channels // self.rf, kernel_size=1, stride=1, padding=0 - ) - self.conv1x1_return_channel = nn.Conv2d( - self.channels // self.rf, self.channels, kernel_size=1, stride=1, padding=0 - ) - - self.v_layer_7x1 = nn.Conv2d( - self.channels // self.rf, - self.channels // self.rf, - kernel_size=(7, 1), - stride=(1, 1), - padding=(3, 0), - ) - self.v_layer_5x1 = nn.Conv2d( - self.channels // self.rf, - self.channels // self.rf, - kernel_size=(5, 1), - stride=(1, 1), - padding=(2, 0), - ) - self.v_layer_3x1 = nn.Conv2d( - self.channels // self.rf, - self.channels // self.rf, - kernel_size=(3, 1), - stride=(1, 1), - padding=(1, 0), - ) - - self.q_layer_1x7 = nn.Conv2d( - self.channels // self.rf, - self.channels // self.rf, - kernel_size=(1, 7), - stride=(1, 1), - padding=(0, 3), - ) - self.q_layer_1x5 = nn.Conv2d( - self.channels // self.rf, - self.channels // self.rf, - kernel_size=(1, 5), - stride=(1, 1), - padding=(0, 2), - ) - self.q_layer_1x3 = nn.Conv2d( - self.channels // self.rf, - self.channels // self.rf, - kernel_size=(1, 3), - stride=(1, 1), - padding=(0, 1), - ) - - # base - self.c_layer_7x7 = nn.Conv2d( - self.channels // self.rf, - self.channels // self.rf, - kernel_size=(7, 7), - stride=(1, 1), - padding=(3, 3), - ) - self.c_layer_5x5 = nn.Conv2d( - self.channels // self.rf, - self.channels // self.rf, - kernel_size=(5, 5), - stride=(1, 1), - padding=(2, 2), - ) - self.c_layer_3x3 = nn.Conv2d( - self.channels // self.rf, - self.channels // self.rf, - kernel_size=(3, 3), - stride=(1, 1), - padding=(1, 1), - ) - - self.bn = nn.BatchNorm2d(self.channels) - self.relu = nn.ReLU() - - def forward(self, x): - x_new = self.conv1x1_reduce_channel(x) - - x_7_c = self.c_layer_7x7(x_new) - x_7_v = self.v_layer_7x1(x_new) - x_7_q = self.q_layer_1x7(x_new) - x_7 = x_7_c + x_7_v + x_7_q - - x_5_c = self.c_layer_5x5(x_7) - x_5_v = self.v_layer_5x1(x_7) - x_5_q = self.q_layer_1x5(x_7) - x_5 = 
x_5_c + x_5_v + x_5_q - - x_3_c = self.c_layer_3x3(x_5) - x_3_v = self.v_layer_3x1(x_5) - x_3_q = self.q_layer_1x3(x_5) - x_3 = x_3_c + x_3_v + x_3_q - - x_relation = self.conv1x1_return_channel(x_3) - - x_relation = self.bn(x_relation) - x_relation = self.relu(x_relation) - - return x + x_relation - - -def build_intraclblock_list(num_block): - IntraCLBlock_list = nn.ModuleList() - for i in range(num_block): - IntraCLBlock_list.append(IntraCLBlock()) - - return IntraCLBlock_list diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/rnn.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/rnn.py deleted file mode 100644 index 79c8af3028a02abbfbc31fec95d5088e3f59c506..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/rnn.py +++ /dev/null @@ -1,241 +0,0 @@ -import torch -from torch import nn - -from ..backbones.rec_svtrnet import Block, ConvBNLayer - - -class Im2Seq(nn.Module): - def __init__(self, in_channels, **kwargs): - super().__init__() - self.out_channels = in_channels - - # def forward(self, x): - # B, C, H, W = x.shape - # # assert H == 1 - # x = x.squeeze(dim=2) - # # x = x.transpose([0, 2, 1]) # paddle (NTC)(batch, width, channels) - # x = x.permute(0, 2, 1) - # return x - - def forward(self, x): - B, C, H, W = x.shape - # 处理四维张量,将空间维度展平为序列 - if H == 1: - # 原来的处理逻辑,适用于H=1的情况 - x = x.squeeze(dim=2) - x = x.permute(0, 2, 1) # (B, W, C) - else: - # 处理H不为1的情况 - x = x.permute(0, 2, 3, 1) # (B, H, W, C) - x = x.reshape(B, H * W, C) # (B, H*W, C) - - return x - -class EncoderWithRNN_(nn.Module): - def __init__(self, in_channels, hidden_size): - super(EncoderWithRNN_, self).__init__() - self.out_channels = hidden_size * 2 - self.rnn1 = nn.LSTM( - in_channels, - hidden_size, - bidirectional=False, - batch_first=True, - num_layers=2, - ) - self.rnn2 = nn.LSTM( - in_channels, - hidden_size, - bidirectional=False, - batch_first=True, - num_layers=2, - ) - - def forward(self, x): - self.rnn1.flatten_parameters() - self.rnn2.flatten_parameters() - out1, h1 = self.rnn1(x) - out2, h2 = self.rnn2(torch.flip(x, [1])) - return torch.cat([out1, torch.flip(out2, [1])], 2) - - -class EncoderWithRNN(nn.Module): - def __init__(self, in_channels, hidden_size): - super(EncoderWithRNN, self).__init__() - self.out_channels = hidden_size * 2 - self.lstm = nn.LSTM( - in_channels, hidden_size, num_layers=2, batch_first=True, bidirectional=True - ) # batch_first:=True - - def forward(self, x): - x, _ = self.lstm(x) - return x - - -class EncoderWithFC(nn.Module): - def __init__(self, in_channels, hidden_size): - super(EncoderWithFC, self).__init__() - self.out_channels = hidden_size - self.fc = nn.Linear( - in_channels, - hidden_size, - bias=True, - ) - - def forward(self, x): - x = self.fc(x) - return x - - -class EncoderWithSVTR(nn.Module): - def __init__( - self, - in_channels, - dims=64, # XS - depth=2, - hidden_dims=120, - use_guide=False, - num_heads=8, - qkv_bias=True, - mlp_ratio=2.0, - drop_rate=0.1, - kernel_size=[3, 3], - attn_drop_rate=0.1, - drop_path=0.0, - qk_scale=None, - ): - super(EncoderWithSVTR, self).__init__() - self.depth = depth - self.use_guide = use_guide - self.conv1 = ConvBNLayer( - in_channels, - in_channels // 8, - kernel_size=kernel_size, - padding=[kernel_size[0] // 2, kernel_size[1] // 2], - act="swish", - ) - self.conv2 = ConvBNLayer( - in_channels // 8, hidden_dims, kernel_size=1, act="swish" - ) - - self.svtr_block = nn.ModuleList( - [ - Block( 
- dim=hidden_dims, - num_heads=num_heads, - mixer="Global", - HW=None, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - act_layer="swish", - attn_drop=attn_drop_rate, - drop_path=drop_path, - norm_layer="nn.LayerNorm", - epsilon=1e-05, - prenorm=False, - ) - for i in range(depth) - ] - ) - self.norm = nn.LayerNorm(hidden_dims, eps=1e-6) - self.conv3 = ConvBNLayer(hidden_dims, in_channels, kernel_size=1, act="swish") - # last conv-nxn, the input is concat of input tensor and conv3 output tensor - self.conv4 = ConvBNLayer( - 2 * in_channels, in_channels // 8, padding=1, act="swish" - ) - - self.conv1x1 = ConvBNLayer(in_channels // 8, dims, kernel_size=1, act="swish") - self.out_channels = dims - self.apply(self._init_weights) - - def _init_weights(self, m): - # weight initialization - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode="fan_out") - if m.bias is not None: - nn.init.zeros_(m.bias) - elif isinstance(m, nn.BatchNorm2d): - nn.init.ones_(m.weight) - nn.init.zeros_(m.bias) - elif isinstance(m, nn.Linear): - nn.init.normal_(m.weight, 0, 0.01) - if m.bias is not None: - nn.init.zeros_(m.bias) - elif isinstance(m, nn.ConvTranspose2d): - nn.init.kaiming_normal_(m.weight, mode="fan_out") - if m.bias is not None: - nn.init.zeros_(m.bias) - elif isinstance(m, nn.LayerNorm): - nn.init.ones_(m.weight) - nn.init.zeros_(m.bias) - - def forward(self, x): - # for use guide - if self.use_guide: - z = x.clone() - z.stop_gradient = True - else: - z = x - # for short cut - h = z - # reduce dim - z = self.conv1(z) - z = self.conv2(z) - # SVTR global block - B, C, H, W = z.shape - z = z.flatten(2).permute(0, 2, 1) - - for blk in self.svtr_block: - z = blk(z) - - z = self.norm(z) - # last stage - z = z.reshape([-1, H, W, C]).permute(0, 3, 1, 2) - z = self.conv3(z) - z = torch.cat((h, z), dim=1) - z = self.conv1x1(self.conv4(z)) - - return z - - -class SequenceEncoder(nn.Module): - def __init__(self, in_channels, encoder_type, hidden_size=48, **kwargs): - super(SequenceEncoder, self).__init__() - self.encoder_reshape = Im2Seq(in_channels) - self.out_channels = self.encoder_reshape.out_channels - self.encoder_type = encoder_type - if encoder_type == "reshape": - self.only_reshape = True - else: - support_encoder_dict = { - "reshape": Im2Seq, - "fc": EncoderWithFC, - "rnn": EncoderWithRNN, - "svtr": EncoderWithSVTR, - } - assert encoder_type in support_encoder_dict, "{} must in {}".format( - encoder_type, support_encoder_dict.keys() - ) - - if encoder_type == "svtr": - self.encoder = support_encoder_dict[encoder_type]( - self.encoder_reshape.out_channels, **kwargs - ) - else: - self.encoder = support_encoder_dict[encoder_type]( - self.encoder_reshape.out_channels, hidden_size - ) - self.out_channels = self.encoder.out_channels - self.only_reshape = False - - def forward(self, x): - if self.encoder_type != "svtr": - x = self.encoder_reshape(x) - if not self.only_reshape: - x = self.encoder(x) - return x - else: - x = self.encoder(x) - x = self.encoder_reshape(x) - return x diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/__init__.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/__init__.py deleted file mode 100755 index 40603ade8895fb995e97310ff75e7e67696bd52b..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/__init__.py +++ /dev/null @@ -1,33 +0,0 @@ - -from __future__ import absolute_import -from __future__ import 
division -from __future__ import print_function -from __future__ import unicode_literals - -import copy - -__all__ = ['build_post_process'] - - -def build_post_process(config, global_config=None): - from .db_postprocess import DBPostProcess - from .rec_postprocess import CTCLabelDecode, AttnLabelDecode, SRNLabelDecode, TableLabelDecode, \ - NRTRLabelDecode, SARLabelDecode, ViTSTRLabelDecode, RFLLabelDecode - from .cls_postprocess import ClsPostProcess - from .rec_postprocess import CANLabelDecode - - support_dict = [ - 'DBPostProcess', 'CTCLabelDecode', - 'AttnLabelDecode', 'ClsPostProcess', 'SRNLabelDecode', - 'TableLabelDecode', 'NRTRLabelDecode', 'SARLabelDecode', - 'ViTSTRLabelDecode','CANLabelDecode', 'RFLLabelDecode' - ] - - config = copy.deepcopy(config) - module_name = config.pop('name') - if global_config is not None: - config.update(global_config) - assert module_name in support_dict, Exception( - 'post process only support {}, but got {}'.format(support_dict, module_name)) - module_class = eval(module_name)(**config) - return module_class \ No newline at end of file diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/cls_postprocess.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/cls_postprocess.py deleted file mode 100755 index c9c6affce380d827090faf67b0e63cde1cdd00fd..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/cls_postprocess.py +++ /dev/null @@ -1,20 +0,0 @@ -import torch - - -class ClsPostProcess(object): - """ Convert between text-label and text-index """ - - def __init__(self, label_list, **kwargs): - super(ClsPostProcess, self).__init__() - self.label_list = label_list - - def __call__(self, preds, label=None, *args, **kwargs): - if isinstance(preds, torch.Tensor): - preds = preds.cpu().numpy() - pred_idxs = preds.argmax(axis=1) - decode_out = [(self.label_list[idx], preds[i, idx]) - for i, idx in enumerate(pred_idxs)] - if label is None: - return decode_out - label = [(self.label_list[idx], 1.0) for idx in label] - return decode_out, label \ No newline at end of file diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/db_postprocess.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/db_postprocess.py deleted file mode 100755 index 309f7f3fe4bbaf3e9b7a472fba3c4dc0b91d202c..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/db_postprocess.py +++ /dev/null @@ -1,179 +0,0 @@ -""" -This code is refered from: -https://github.com/WenmuZhou/DBNet.pytorch/blob/master/post_processing/seg_detector_representer.py -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import cv2 -import torch -from shapely.geometry import Polygon -import pyclipper - - -class DBPostProcess(object): - """ - The post process for Differentiable Binarization (DB). 
- """ - - def __init__(self, - thresh=0.3, - box_thresh=0.7, - max_candidates=1000, - unclip_ratio=2.0, - use_dilation=False, - score_mode="fast", - **kwargs): - self.thresh = thresh - self.box_thresh = box_thresh - self.max_candidates = max_candidates - self.unclip_ratio = unclip_ratio - self.min_size = 3 - self.score_mode = score_mode - assert score_mode in [ - "slow", "fast" - ], "Score mode must be in [slow, fast] but got: {}".format(score_mode) - - self.dilation_kernel = None if not use_dilation else np.array( - [[1, 1], [1, 1]]) - - def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height): - ''' - _bitmap: single map with shape (1, H, W), - whose values are binarized as {0, 1} - ''' - - bitmap = _bitmap - height, width = bitmap.shape - - outs = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST, - cv2.CHAIN_APPROX_SIMPLE) - if len(outs) == 3: - img, contours, _ = outs[0], outs[1], outs[2] - elif len(outs) == 2: - contours, _ = outs[0], outs[1] - - num_contours = min(len(contours), self.max_candidates) - - boxes = [] - scores = [] - for index in range(num_contours): - contour = contours[index] - points, sside = self.get_mini_boxes(contour) - if sside < self.min_size: - continue - points = np.array(points) - if self.score_mode == "fast": - score = self.box_score_fast(pred, points.reshape(-1, 2)) - else: - score = self.box_score_slow(pred, contour) - if self.box_thresh > score: - continue - - box = self.unclip(points).reshape(-1, 1, 2) - box, sside = self.get_mini_boxes(box) - if sside < self.min_size + 2: - continue - box = np.array(box) - - box[:, 0] = np.clip( - np.round(box[:, 0] / width * dest_width), 0, dest_width) - box[:, 1] = np.clip( - np.round(box[:, 1] / height * dest_height), 0, dest_height) - boxes.append(box.astype(np.int16)) - scores.append(score) - return np.array(boxes, dtype=np.int16), scores - - def unclip(self, box): - unclip_ratio = self.unclip_ratio - poly = Polygon(box) - distance = poly.area * unclip_ratio / poly.length - offset = pyclipper.PyclipperOffset() - offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON) - expanded = np.array(offset.Execute(distance)) - return expanded - - def get_mini_boxes(self, contour): - bounding_box = cv2.minAreaRect(contour) - points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0]) - - index_1, index_2, index_3, index_4 = 0, 1, 2, 3 - if points[1][1] > points[0][1]: - index_1 = 0 - index_4 = 1 - else: - index_1 = 1 - index_4 = 0 - if points[3][1] > points[2][1]: - index_2 = 2 - index_3 = 3 - else: - index_2 = 3 - index_3 = 2 - - box = [ - points[index_1], points[index_2], points[index_3], points[index_4] - ] - return box, min(bounding_box[1]) - - def box_score_fast(self, bitmap, _box): - ''' - box_score_fast: use bbox mean score as the mean score - ''' - h, w = bitmap.shape[:2] - box = _box.copy() - xmin = np.clip(np.floor(box[:, 0].min()).astype(np.int64), 0, w - 1) - xmax = np.clip(np.ceil(box[:, 0].max()).astype(np.int64), 0, w - 1) - ymin = np.clip(np.floor(box[:, 1].min()).astype(np.int64), 0, h - 1) - ymax = np.clip(np.ceil(box[:, 1].max()).astype(np.int64), 0, h - 1) - - mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8) - box[:, 0] = box[:, 0] - xmin - box[:, 1] = box[:, 1] - ymin - cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1) - return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0] - - def box_score_slow(self, bitmap, contour): - ''' - box_score_slow: use polyon mean score as the mean score - ''' - h, w = bitmap.shape[:2] - 
contour = contour.copy() - contour = np.reshape(contour, (-1, 2)) - - xmin = np.clip(np.min(contour[:, 0]), 0, w - 1) - xmax = np.clip(np.max(contour[:, 0]), 0, w - 1) - ymin = np.clip(np.min(contour[:, 1]), 0, h - 1) - ymax = np.clip(np.max(contour[:, 1]), 0, h - 1) - - mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8) - - contour[:, 0] = contour[:, 0] - xmin - contour[:, 1] = contour[:, 1] - ymin - - cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype(np.int32), 1) - return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0] - - def __call__(self, outs_dict, shape_list): - pred = outs_dict['maps'] - if isinstance(pred, torch.Tensor): - pred = pred.cpu().numpy() - pred = pred[:, 0, :, :] - segmentation = pred > self.thresh - - boxes_batch = [] - for batch_index in range(pred.shape[0]): - src_h, src_w, ratio_h, ratio_w = shape_list[batch_index] - if self.dilation_kernel is not None: - mask = cv2.dilate( - np.array(segmentation[batch_index]).astype(np.uint8), - self.dilation_kernel) - else: - mask = segmentation[batch_index] - boxes, scores = self.boxes_from_bitmap(pred[batch_index], mask, - src_w, src_h) - - boxes_batch.append({'points': boxes}) - return boxes_batch \ No newline at end of file diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/rec_postprocess.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/rec_postprocess.py deleted file mode 100755 index c83fe5c33dbee9be142880aa088f054131dac042..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/rec_postprocess.py +++ /dev/null @@ -1,690 +0,0 @@ -# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import numpy as np -import torch - - -class BaseRecLabelDecode(object): - """ Convert between text-label and text-index """ - - def __init__(self, - character_dict_path=None, - use_space_char=False): - - self.beg_str = "sos" - self.end_str = "eos" - - self.character_str = [] - if character_dict_path is None: - self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" - dict_character = list(self.character_str) - else: - with open(character_dict_path, "rb") as fin: - lines = fin.readlines() - for line in lines: - line = line.decode('utf-8').strip("\n").strip("\r\n") - self.character_str.append(line) - if use_space_char: - self.character_str.append(" ") - dict_character = list(self.character_str) - - dict_character = self.add_special_char(dict_character) - self.dict = {} - for i, char in enumerate(dict_character): - self.dict[char] = i - self.character = dict_character - - def add_special_char(self, dict_character): - return dict_character - - def decode(self, text_index, text_prob=None, is_remove_duplicate=False): - """ convert text-index into text-label. 
""" - result_list = [] - ignored_tokens = self.get_ignored_tokens() - batch_size = len(text_index) - for batch_idx in range(batch_size): - char_list = [] - conf_list = [] - for idx in range(len(text_index[batch_idx])): - if text_index[batch_idx][idx] in ignored_tokens: - continue - if is_remove_duplicate: - # only for predict - if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ - batch_idx][idx]: - continue - char_list.append(self.character[int(text_index[batch_idx][ - idx])]) - if text_prob is not None: - conf_list.append(text_prob[batch_idx][idx]) - else: - conf_list.append(1) - text = ''.join(char_list) - result_list.append((text, np.mean(conf_list))) - return result_list - - def get_ignored_tokens(self): - return [0] # for ctc blank - - -class CTCLabelDecode(BaseRecLabelDecode): - """ Convert between text-label and text-index """ - - def __init__(self, - character_dict_path=None, - use_space_char=False, - **kwargs): - super(CTCLabelDecode, self).__init__(character_dict_path, - use_space_char) - - def __call__(self, preds, label=None, *args, **kwargs): - if isinstance(preds, torch.Tensor): - preds = preds.numpy() - preds_idx = preds.argmax(axis=2) - preds_prob = preds.max(axis=2) - text = self.decode(preds_idx, preds_prob, is_remove_duplicate=True) - - if label is None: - return text - label = self.decode(label) - return text, label - - def add_special_char(self, dict_character): - dict_character = ['blank'] + dict_character - return dict_character - - -class NRTRLabelDecode(BaseRecLabelDecode): - """ Convert between text-label and text-index """ - - def __init__(self, character_dict_path=None, use_space_char=True, **kwargs): - super(NRTRLabelDecode, self).__init__(character_dict_path, - use_space_char) - - def __call__(self, preds, label=None, *args, **kwargs): - - if len(preds) == 2: - preds_id = preds[0] - preds_prob = preds[1] - if isinstance(preds_id, torch.Tensor): - preds_id = preds_id.numpy() - if isinstance(preds_prob, torch.Tensor): - preds_prob = preds_prob.numpy() - if preds_id[0][0] == 2: - preds_idx = preds_id[:, 1:] - preds_prob = preds_prob[:, 1:] - else: - preds_idx = preds_id - text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) - if label is None: - return text - label = self.decode(label[:, 1:]) - else: - if isinstance(preds, torch.Tensor): - preds = preds.numpy() - preds_idx = preds.argmax(axis=2) - preds_prob = preds.max(axis=2) - text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) - if label is None: - return text - label = self.decode(label[:, 1:]) - return text, label - - def add_special_char(self, dict_character): - dict_character = ['blank', '', '', ''] + dict_character - return dict_character - - def decode(self, text_index, text_prob=None, is_remove_duplicate=False): - """ convert text-index into text-label. 
""" - result_list = [] - batch_size = len(text_index) - for batch_idx in range(batch_size): - char_list = [] - conf_list = [] - for idx in range(len(text_index[batch_idx])): - try: - char_idx = self.character[int(text_index[batch_idx][idx])] - except: - continue - if char_idx == '': # end - break - char_list.append(char_idx) - if text_prob is not None: - conf_list.append(text_prob[batch_idx][idx]) - else: - conf_list.append(1) - text = ''.join(char_list) - result_list.append((text.lower(), np.mean(conf_list).tolist())) - return result_list - -class ViTSTRLabelDecode(NRTRLabelDecode): - """ Convert between text-label and text-index """ - - def __init__(self, character_dict_path=None, use_space_char=False, - **kwargs): - super(ViTSTRLabelDecode, self).__init__(character_dict_path, - use_space_char) - - def __call__(self, preds, label=None, *args, **kwargs): - if isinstance(preds, torch.Tensor): - preds = preds[:, 1:].numpy() - else: - preds = preds[:, 1:] - preds_idx = preds.argmax(axis=2) - preds_prob = preds.max(axis=2) - text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) - if label is None: - return text - label = self.decode(label[:, 1:]) - return text, label - - def add_special_char(self, dict_character): - dict_character = ['', ''] + dict_character - return dict_character - - -class AttnLabelDecode(BaseRecLabelDecode): - """ Convert between text-label and text-index """ - - def __init__(self, - character_dict_path=None, - use_space_char=False, - **kwargs): - super(AttnLabelDecode, self).__init__(character_dict_path, - use_space_char) - - def add_special_char(self, dict_character): - self.beg_str = "sos" - self.end_str = "eos" - dict_character = dict_character - dict_character = [self.beg_str] + dict_character + [self.end_str] - return dict_character - - def decode(self, text_index, text_prob=None, is_remove_duplicate=False): - """ convert text-index into text-label. 
""" - result_list = [] - ignored_tokens = self.get_ignored_tokens() - [beg_idx, end_idx] = self.get_ignored_tokens() - batch_size = len(text_index) - for batch_idx in range(batch_size): - char_list = [] - conf_list = [] - for idx in range(len(text_index[batch_idx])): - if text_index[batch_idx][idx] in ignored_tokens: - continue - if int(text_index[batch_idx][idx]) == int(end_idx): - break - if is_remove_duplicate: - # only for predict - if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ - batch_idx][idx]: - continue - char_list.append(self.character[int(text_index[batch_idx][ - idx])]) - if text_prob is not None: - conf_list.append(text_prob[batch_idx][idx]) - else: - conf_list.append(1) - text = ''.join(char_list) - result_list.append((text, np.mean(conf_list))) - return result_list - - def __call__(self, preds, label=None, *args, **kwargs): - """ - text = self.decode(text) - if label is None: - return text - else: - label = self.decode(label, is_remove_duplicate=False) - return text, label - """ - if isinstance(preds, torch.Tensor): - preds = preds.cpu().numpy() - - preds_idx = preds.argmax(axis=2) - preds_prob = preds.max(axis=2) - text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) - if label is None: - return text - label = self.decode(label, is_remove_duplicate=False) - return text, label - - def get_ignored_tokens(self): - beg_idx = self.get_beg_end_flag_idx("beg") - end_idx = self.get_beg_end_flag_idx("end") - return [beg_idx, end_idx] - - def get_beg_end_flag_idx(self, beg_or_end): - if beg_or_end == "beg": - idx = np.array(self.dict[self.beg_str]) - elif beg_or_end == "end": - idx = np.array(self.dict[self.end_str]) - else: - assert False, "unsupport type %s in get_beg_end_flag_idx" \ - % beg_or_end - return idx - - -class RFLLabelDecode(BaseRecLabelDecode): - """ Convert between text-label and text-index """ - - def __init__(self, character_dict_path=None, use_space_char=False, - **kwargs): - super(RFLLabelDecode, self).__init__(character_dict_path, - use_space_char) - - def add_special_char(self, dict_character): - self.beg_str = "sos" - self.end_str = "eos" - dict_character = dict_character - dict_character = [self.beg_str] + dict_character + [self.end_str] - return dict_character - - def decode(self, text_index, text_prob=None, is_remove_duplicate=False): - """ convert text-index into text-label. 
""" - result_list = [] - ignored_tokens = self.get_ignored_tokens() - [beg_idx, end_idx] = self.get_ignored_tokens() - batch_size = len(text_index) - for batch_idx in range(batch_size): - char_list = [] - conf_list = [] - for idx in range(len(text_index[batch_idx])): - if text_index[batch_idx][idx] in ignored_tokens: - continue - if int(text_index[batch_idx][idx]) == int(end_idx): - break - if is_remove_duplicate: - # only for predict - if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ - batch_idx][idx]: - continue - char_list.append(self.character[int(text_index[batch_idx][ - idx])]) - if text_prob is not None: - conf_list.append(text_prob[batch_idx][idx]) - else: - conf_list.append(1) - text = ''.join(char_list) - result_list.append((text, np.mean(conf_list).tolist())) - return result_list - - def __call__(self, preds, label=None, *args, **kwargs): - # if seq_outputs is not None: - if isinstance(preds, tuple) or isinstance(preds, list): - cnt_outputs, seq_outputs = preds - if isinstance(seq_outputs, torch.Tensor): - seq_outputs = seq_outputs.numpy() - preds_idx = seq_outputs.argmax(axis=2) - preds_prob = seq_outputs.max(axis=2) - text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) - - if label is None: - return text - label = self.decode(label, is_remove_duplicate=False) - return text, label - - else: - cnt_outputs = preds - if isinstance(cnt_outputs, torch.Tensor): - cnt_outputs = cnt_outputs.numpy() - cnt_length = [] - for lens in cnt_outputs: - length = round(np.sum(lens)) - cnt_length.append(length) - if label is None: - return cnt_length - label = self.decode(label, is_remove_duplicate=False) - length = [len(res[0]) for res in label] - return cnt_length, length - - def get_ignored_tokens(self): - beg_idx = self.get_beg_end_flag_idx("beg") - end_idx = self.get_beg_end_flag_idx("end") - return [beg_idx, end_idx] - - def get_beg_end_flag_idx(self, beg_or_end): - if beg_or_end == "beg": - idx = np.array(self.dict[self.beg_str]) - elif beg_or_end == "end": - idx = np.array(self.dict[self.end_str]) - else: - assert False, "unsupport type %s in get_beg_end_flag_idx" \ - % beg_or_end - return idx - - -class SRNLabelDecode(BaseRecLabelDecode): - """ Convert between text-label and text-index """ - - def __init__(self, - character_dict_path=None, - use_space_char=False, - **kwargs): - self.max_text_length = kwargs.get('max_text_length', 25) - super(SRNLabelDecode, self).__init__(character_dict_path, - use_space_char) - - def __call__(self, preds, label=None, *args, **kwargs): - pred = preds['predict'] - char_num = len(self.character_str) + 2 - if isinstance(pred, torch.Tensor): - pred = pred.numpy() - pred = np.reshape(pred, [-1, char_num]) - - preds_idx = np.argmax(pred, axis=1) - preds_prob = np.max(pred, axis=1) - - preds_idx = np.reshape(preds_idx, [-1, self.max_text_length]) - - preds_prob = np.reshape(preds_prob, [-1, self.max_text_length]) - - text = self.decode(preds_idx, preds_prob) - - if label is None: - text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) - return text - label = self.decode(label) - return text, label - - def decode(self, text_index, text_prob=None, is_remove_duplicate=False): - """ convert text-index into text-label. 
""" - result_list = [] - ignored_tokens = self.get_ignored_tokens() - batch_size = len(text_index) - - for batch_idx in range(batch_size): - char_list = [] - conf_list = [] - for idx in range(len(text_index[batch_idx])): - if text_index[batch_idx][idx] in ignored_tokens: - continue - if is_remove_duplicate: - # only for predict - if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ - batch_idx][idx]: - continue - char_list.append(self.character[int(text_index[batch_idx][ - idx])]) - if text_prob is not None: - conf_list.append(text_prob[batch_idx][idx]) - else: - conf_list.append(1) - - text = ''.join(char_list) - result_list.append((text, np.mean(conf_list))) - return result_list - - def add_special_char(self, dict_character): - dict_character = dict_character + [self.beg_str, self.end_str] - return dict_character - - def get_ignored_tokens(self): - beg_idx = self.get_beg_end_flag_idx("beg") - end_idx = self.get_beg_end_flag_idx("end") - return [beg_idx, end_idx] - - def get_beg_end_flag_idx(self, beg_or_end): - if beg_or_end == "beg": - idx = np.array(self.dict[self.beg_str]) - elif beg_or_end == "end": - idx = np.array(self.dict[self.end_str]) - else: - assert False, "unsupport type %s in get_beg_end_flag_idx" \ - % beg_or_end - return idx - - -class TableLabelDecode(object): - """ """ - - def __init__(self, - character_dict_path, - **kwargs): - list_character, list_elem = self.load_char_elem_dict(character_dict_path) - list_character = self.add_special_char(list_character) - list_elem = self.add_special_char(list_elem) - self.dict_character = {} - self.dict_idx_character = {} - for i, char in enumerate(list_character): - self.dict_idx_character[i] = char - self.dict_character[char] = i - self.dict_elem = {} - self.dict_idx_elem = {} - for i, elem in enumerate(list_elem): - self.dict_idx_elem[i] = elem - self.dict_elem[elem] = i - - def load_char_elem_dict(self, character_dict_path): - list_character = [] - list_elem = [] - with open(character_dict_path, "rb") as fin: - lines = fin.readlines() - substr = lines[0].decode('utf-8').strip("\n").strip("\r\n").split("\t") - character_num = int(substr[0]) - elem_num = int(substr[1]) - for cno in range(1, 1 + character_num): - character = lines[cno].decode('utf-8').strip("\n").strip("\r\n") - list_character.append(character) - for eno in range(1 + character_num, 1 + character_num + elem_num): - elem = lines[eno].decode('utf-8').strip("\n").strip("\r\n") - list_elem.append(elem) - return list_character, list_elem - - def add_special_char(self, list_character): - self.beg_str = "sos" - self.end_str = "eos" - list_character = [self.beg_str] + list_character + [self.end_str] - return list_character - - def __call__(self, preds): - structure_probs = preds['structure_probs'] - loc_preds = preds['loc_preds'] - if isinstance(structure_probs,torch.Tensor): - structure_probs = structure_probs.numpy() - if isinstance(loc_preds,torch.Tensor): - loc_preds = loc_preds.numpy() - structure_idx = structure_probs.argmax(axis=2) - structure_probs = structure_probs.max(axis=2) - structure_str, structure_pos, result_score_list, result_elem_idx_list = self.decode(structure_idx, - structure_probs, 'elem') - res_html_code_list = [] - res_loc_list = [] - batch_num = len(structure_str) - for bno in range(batch_num): - res_loc = [] - for sno in range(len(structure_str[bno])): - text = structure_str[bno][sno] - if text in ['', ' 0 and tmp_elem_idx == end_idx: - break - if tmp_elem_idx in ignored_tokens: - continue - - char_list.append(current_dict[tmp_elem_idx]) - 
elem_pos_list.append(idx) - score_list.append(structure_probs[batch_idx, idx]) - elem_idx_list.append(tmp_elem_idx) - result_list.append(char_list) - result_pos_list.append(elem_pos_list) - result_score_list.append(score_list) - result_elem_idx_list.append(elem_idx_list) - return result_list, result_pos_list, result_score_list, result_elem_idx_list - - def get_ignored_tokens(self, char_or_elem): - beg_idx = self.get_beg_end_flag_idx("beg", char_or_elem) - end_idx = self.get_beg_end_flag_idx("end", char_or_elem) - return [beg_idx, end_idx] - - def get_beg_end_flag_idx(self, beg_or_end, char_or_elem): - if char_or_elem == "char": - if beg_or_end == "beg": - idx = self.dict_character[self.beg_str] - elif beg_or_end == "end": - idx = self.dict_character[self.end_str] - else: - assert False, "Unsupport type %s in get_beg_end_flag_idx of char" \ - % beg_or_end - elif char_or_elem == "elem": - if beg_or_end == "beg": - idx = self.dict_elem[self.beg_str] - elif beg_or_end == "end": - idx = self.dict_elem[self.end_str] - else: - assert False, "Unsupport type %s in get_beg_end_flag_idx of elem" \ - % beg_or_end - else: - assert False, "Unsupport type %s in char_or_elem" \ - % char_or_elem - return idx - - -class SARLabelDecode(BaseRecLabelDecode): - """ Convert between text-label and text-index """ - - def __init__(self, character_dict_path=None, use_space_char=False, - **kwargs): - super(SARLabelDecode, self).__init__(character_dict_path, - use_space_char) - - self.rm_symbol = kwargs.get('rm_symbol', False) - - def add_special_char(self, dict_character): - beg_end_str = "" - unknown_str = "" - padding_str = "" - dict_character = dict_character + [unknown_str] - self.unknown_idx = len(dict_character) - 1 - dict_character = dict_character + [beg_end_str] - self.start_idx = len(dict_character) - 1 - self.end_idx = len(dict_character) - 1 - dict_character = dict_character + [padding_str] - self.padding_idx = len(dict_character) - 1 - return dict_character - - def decode(self, text_index, text_prob=None, is_remove_duplicate=False): - """ convert text-index into text-label. 
""" - result_list = [] - ignored_tokens = self.get_ignored_tokens() - - batch_size = len(text_index) - for batch_idx in range(batch_size): - char_list = [] - conf_list = [] - for idx in range(len(text_index[batch_idx])): - if text_index[batch_idx][idx] in ignored_tokens: - continue - if int(text_index[batch_idx][idx]) == int(self.end_idx): - if text_prob is None and idx == 0: - continue - else: - break - if is_remove_duplicate: - # only for predict - if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ - batch_idx][idx]: - continue - char_list.append(self.character[int(text_index[batch_idx][ - idx])]) - if text_prob is not None: - conf_list.append(text_prob[batch_idx][idx]) - else: - conf_list.append(1) - text = ''.join(char_list) - if self.rm_symbol: - comp = re.compile('[^A-Z^a-z^0-9^\u4e00-\u9fa5]') - text = text.lower() - text = comp.sub('', text) - result_list.append((text, np.mean(conf_list).tolist())) - return result_list - - def __call__(self, preds, label=None, *args, **kwargs): - if isinstance(preds, torch.Tensor): - preds = preds.cpu().numpy() - preds_idx = preds.argmax(axis=2) - preds_prob = preds.max(axis=2) - - text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) - - if label is None: - return text - label = self.decode(label, is_remove_duplicate=False) - return text, label - - def get_ignored_tokens(self): - return [self.padding_idx] - - -class CANLabelDecode(BaseRecLabelDecode): - """ Convert between latex-symbol and symbol-index """ - - def __init__(self, character_dict_path=None, use_space_char=False, - **kwargs): - super(CANLabelDecode, self).__init__(character_dict_path, - use_space_char) - - def decode(self, text_index, preds_prob=None): - result_list = [] - batch_size = len(text_index) - for batch_idx in range(batch_size): - seq_end = text_index[batch_idx].argmin(0) - idx_list = text_index[batch_idx][:seq_end].tolist() - symbol_list = [self.character[idx] for idx in idx_list] - probs = [] - if preds_prob is not None: - probs = preds_prob[batch_idx][:len(symbol_list)].tolist() - - result_list.append([' '.join(symbol_list), probs]) - return result_list - - def __call__(self, preds, label=None, *args, **kwargs): - pred_prob, _, _, _ = preds - preds_idx = pred_prob.argmax(axis=2) - - text = self.decode(preds_idx) - if label is None: - return text - label = self.decode(label) - return text, label \ No newline at end of file diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/__init__.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/__init__.py deleted file mode 100755 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml deleted file mode 100644 index 2dd3b633a8c13d5c450ebc93f84c8f59ae5c8d93..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml +++ /dev/null @@ -1,476 +0,0 @@ -ch_ptocr_mobile_v2.0_cls_infer: - model_type: cls - algorithm: CLS - Transform: - Backbone: - name: MobileNetV3 - scale: 0.35 - model_name: small - Neck: - Head: - name: ClsHead - class_dim: 2 - -Multilingual_PP-OCRv3_det_infer: - model_type: det - algorithm: DB - Transform: - Backbone: - name: MobileNetV3 - scale: 0.5 - model_name: large - disable_se: True - Neck: - name: RSEFPN - out_channels: 96 - 
shortcut: True - Head: - name: DBHead - k: 50 - -en_PP-OCRv3_det_infer: - model_type: det - algorithm: DB - Transform: - Backbone: - name: MobileNetV3 - scale: 0.5 - model_name: large - disable_se: True - Neck: - name: RSEFPN - out_channels: 96 - shortcut: True - Head: - name: DBHead - k: 50 - -ch_PP-OCRv3_det_infer: - model_type: det - algorithm: DB - Transform: - Backbone: - name: MobileNetV3 - scale: 0.5 - model_name: large - disable_se: True - Neck: - name: RSEFPN - out_channels: 96 - shortcut: True - Head: - name: DBHead - k: 50 - -en_PP-OCRv4_rec_infer: - model_type: rec - algorithm: SVTR_LCNet - Transform: - Backbone: - name: PPLCNetV3 - scale: 0.95 - Head: - name: MultiHead - out_channels_list: - CTCLabelDecode: 97 #'blank' + ...(62) + ' ' - head_list: - - CTCHead: - Neck: - name: svtr - dims: 120 - depth: 2 - hidden_dims: 120 - kernel_size: [ 1, 3 ] - use_guide: True - Head: - fc_decay: 0.00001 - - NRTRHead: - nrtr_dim: 384 - max_text_length: 25 - -ch_PP-OCRv4_det_infer: - model_type: det - algorithm: DB - Transform: null - Backbone: - name: PPLCNetV3 - scale: 0.75 - det: True - Neck: - name: RSEFPN - out_channels: 96 - shortcut: True - Head: - name: DBHead - k: 50 - -ch_PP-OCRv5_det_infer: - model_type: det - algorithm: DB - Transform: null - Backbone: - name: PPLCNetV3 - scale: 0.75 - det: True - Neck: - name: RSEFPN - out_channels: 96 - shortcut: True - Head: - name: DBHead - k: 50 - -ch_PP-OCRv4_det_server_infer: - model_type: det - algorithm: DB - Transform: null - Backbone: - name: PPHGNet_small - det: True - Neck: - name: LKPAN - out_channels: 256 - intracl: true - Head: - name: PFHeadLocal - k: 50 - mode: "large" - -ch_PP-OCRv4_rec_infer: - model_type: rec - algorithm: SVTR_LCNet - Transform: - Backbone: - name: PPLCNetV3 - scale: 0.95 - Head: - name: MultiHead - out_channels_list: - CTCLabelDecode: 6625 #'blank' + ...(6623) + ' ' - head_list: - - CTCHead: - Neck: - name: svtr - dims: 120 - depth: 2 - hidden_dims: 120 - kernel_size: [ 1, 3 ] - use_guide: True - Head: - fc_decay: 0.00001 - - NRTRHead: - nrtr_dim: 384 - max_text_length: 25 - -ch_PP-OCRv4_rec_server_infer: - model_type: rec - algorithm: SVTR_HGNet - Transform: - Backbone: - name: PPHGNet_small - Head: - name: MultiHead - out_channels_list: - CTCLabelDecode: 6625 #'blank' + ...(6623) + ' ' - head_list: - - CTCHead: - Neck: - name: svtr - dims: 120 - depth: 2 - hidden_dims: 120 - kernel_size: [ 1, 3 ] - use_guide: True - Head: - fc_decay: 0.00001 - - NRTRHead: - nrtr_dim: 384 - max_text_length: 25 - -ch_PP-OCRv4_rec_server_doc_infer: - model_type: rec - algorithm: SVTR_HGNet - Transform: - Backbone: - name: PPHGNet_small - Head: - name: MultiHead - out_channels_list: - CTCLabelDecode: 15631 - head_list: - - CTCHead: - Neck: - name: svtr - dims: 120 - depth: 2 - hidden_dims: 120 - kernel_size: [ 1, 3 ] - use_guide: True - Head: - fc_decay: 0.00001 - - NRTRHead: - nrtr_dim: 384 - max_text_length: 25 - -ch_PP-OCRv5_rec_server_infer: - model_type: rec - algorithm: SVTR_HGNet - Transform: - Backbone: - name: PPHGNetV2_B4 - text_rec: True - Head: - name: MultiHead - out_channels_list: - CTCLabelDecode: 18385 - head_list: - - CTCHead: - Neck: - name: svtr - dims: 120 - depth: 2 - hidden_dims: 120 - kernel_size: [ 1, 3 ] - use_guide: True - Head: - fc_decay: 0.00001 - - NRTRHead: - nrtr_dim: 384 - max_text_length: 25 - -ch_PP-OCRv5_rec_infer: - model_type: rec - algorithm: SVTR_HGNet - Transform: - Backbone: - name: PPLCNetV3 - scale: 0.95 - Head: - name: MultiHead - out_channels_list: - CTCLabelDecode: 18385 - 
head_list: - - CTCHead: - Neck: - name: svtr - dims: 120 - depth: 2 - hidden_dims: 120 - kernel_size: [ 1, 3 ] - use_guide: True - Head: - fc_decay: 0.00001 - - NRTRHead: - nrtr_dim: 384 - max_text_length: 25 - -chinese_cht_PP-OCRv3_rec_infer: - model_type: rec - algorithm: SVTR - Transform: - Backbone: - name: MobileNetV1Enhance - scale: 0.5 - last_conv_stride: [1, 2] - last_pool_type: avg - Neck: - name: SequenceEncoder - encoder_type: svtr - dims: 64 - depth: 2 - hidden_dims: 120 - use_guide: True - Head: - name: CTCHead -# out_channels: 8423 - fc_decay: 0.00001 - -latin_PP-OCRv3_rec_infer: - model_type: rec - algorithm: SVTR - Transform: - Backbone: - name: MobileNetV1Enhance - scale: 0.5 - last_conv_stride: [ 1, 2 ] - last_pool_type: avg - Neck: - name: SequenceEncoder - encoder_type: svtr - dims: 64 - depth: 2 - hidden_dims: 120 - use_guide: True - Head: - name: CTCHead -# out_channels: 187 - fc_decay: 0.00001 - -cyrillic_PP-OCRv3_rec_infer: - model_type: rec - algorithm: SVTR - Transform: - Backbone: - name: MobileNetV1Enhance - scale: 0.5 - last_conv_stride: [ 1, 2 ] - last_pool_type: avg - Neck: - name: SequenceEncoder - encoder_type: svtr - dims: 64 - depth: 2 - hidden_dims: 120 - use_guide: True - Head: - name: CTCHead -# out_channels: 165 - fc_decay: 0.00001 - -arabic_PP-OCRv3_rec_infer: - model_type: rec - algorithm: SVTR - Transform: - Backbone: - name: MobileNetV1Enhance - scale: 0.5 - last_conv_stride: [ 1, 2 ] - last_pool_type: avg - Neck: - name: SequenceEncoder - encoder_type: svtr - dims: 64 - depth: 2 - hidden_dims: 120 - use_guide: True - Head: - name: CTCHead -# out_channels: 164 - fc_decay: 0.00001 - -korean_PP-OCRv3_rec_infer: - model_type: rec - algorithm: SVTR - Transform: - Backbone: - name: MobileNetV1Enhance - scale: 0.5 - last_conv_stride: [ 1, 2 ] - last_pool_type: avg - Neck: - name: SequenceEncoder - encoder_type: svtr - dims: 64 - depth: 2 - hidden_dims: 120 - use_guide: True - Head: - name: CTCHead -# out_channels: 3690 - fc_decay: 0.00001 - -japan_PP-OCRv3_rec_infer: - model_type: rec - algorithm: SVTR - Transform: - Backbone: - name: MobileNetV1Enhance - scale: 0.5 - last_conv_stride: [ 1, 2 ] - last_pool_type: avg - Neck: - name: SequenceEncoder - encoder_type: svtr - dims: 64 - depth: 2 - hidden_dims: 120 - use_guide: True - Head: - name: CTCHead -# out_channels: 4401 - fc_decay: 0.00001 - -ta_PP-OCRv3_rec_infer: - model_type: rec - algorithm: SVTR - Transform: - Backbone: - name: MobileNetV1Enhance - scale: 0.5 - last_conv_stride: [ 1, 2 ] - last_pool_type: avg - Neck: - name: SequenceEncoder - encoder_type: svtr - dims: 64 - depth: 2 - hidden_dims: 120 - use_guide: True - Head: - name: CTCHead -# out_channels: 130 - fc_decay: 0.00001 - -te_PP-OCRv3_rec_infer: - model_type: rec - algorithm: SVTR - Transform: - Backbone: - name: MobileNetV1Enhance - scale: 0.5 - last_conv_stride: [ 1, 2 ] - last_pool_type: avg - Neck: - name: SequenceEncoder - encoder_type: svtr - dims: 64 - depth: 2 - hidden_dims: 120 - use_guide: True - Head: - name: CTCHead -# out_channels: 153 - fc_decay: 0.00001 - -ka_PP-OCRv3_rec_infer: - model_type: rec - algorithm: SVTR - Transform: - Backbone: - name: MobileNetV1Enhance - scale: 0.5 - last_conv_stride: [ 1, 2 ] - last_pool_type: avg - Neck: - name: SequenceEncoder - encoder_type: svtr - dims: 64 - depth: 2 - hidden_dims: 120 - use_guide: True - Head: - name: CTCHead -# out_channels: 155 - fc_decay: 0.00001 - -devanagari_PP-OCRv3_rec_infer: - model_type: rec - algorithm: SVTR - Transform: - Backbone: - name: 
MobileNetV1Enhance - scale: 0.5 - last_conv_stride: [ 1, 2 ] - last_pool_type: avg - Neck: - name: SequenceEncoder - encoder_type: svtr - dims: 64 - depth: 2 - hidden_dims: 120 - use_guide: True - Head: - name: CTCHead -# out_channels: 169 - fc_decay: 0.00001 - diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/arabic_dict.txt b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/arabic_dict.txt deleted file mode 100644 index e97abf39274df77fbad066ee4635aebc6743140c..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/arabic_dict.txt +++ /dev/null @@ -1,162 +0,0 @@ - -! -# -$ -% -& -' -( -+ -, -- -. -/ -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -: -? -@ -A -B -C -D -E -F -G -H -I -J -K -L -M -N -O -P -Q -R -S -T -U -V -W -X -Y -Z -_ -a -b -c -d -e -f -g -h -i -j -k -l -m -n -o -p -q -r -s -t -u -v -w -x -y -z -É -é -ء -آ -أ -ؤ -إ -ئ -ا -ب -ة -ت -ث -ج -ح -خ -د -ذ -ر -ز -س -ش -ص -ض -ط -ظ -ع -غ -ف -ق -ك -ل -م -ن -ه -و -ى -ي -ً -ٌ -ٍ -َ -ُ -ِ -ّ -ْ -ٓ -ٔ -ٰ -ٱ -ٹ -پ -چ -ڈ -ڑ -ژ -ک -ڭ -گ -ں -ھ -ۀ -ہ -ۂ -ۃ -ۆ -ۇ -ۈ -ۋ -ی -ې -ے -ۓ -ە -١ -٢ -٣ -٤ -٥ -٦ -٧ -٨ -٩ diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/chinese_cht_dict.txt b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/chinese_cht_dict.txt deleted file mode 100644 index cc1aa4724b9a6f0e15275bcf61c91c26b6550c3e..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/chinese_cht_dict.txt +++ /dev/null @@ -1,8421 +0,0 @@ -! -" -# -$ -% -& -' -( -) -* -+ -, -- -. -/ -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -: -; -< -= -> -? -@ -A -B -C -D -E -F -G -H -I -J -K -L -M -N -O -P -Q -R -S -T -U -V -W -X -Y -Z -[ -\ -] -^ -_ -` -a -b -c -d -e -f -g -h -i -j -k -l -m -n -o -p -q -r -s -t -u -v -w -x -y -z -{ -| -} -~ -¥ -® -° -± -² -´ -· -» -É -Ë -Ó -× -Ü -à -á -ä -è -é -ì -í -ò -ó -÷ -ú -ü -ā -ē -ī -ō -ū -ǐ -ǒ -ɔ -ɡ -ʌ -ˋ -Λ -Ο -Φ -Ω -α -β -ε -θ -μ -π -З -И -Й -П -Я -г -— -‖ -‘ -’ -“ -” -• -… -‧ -′ -″ -※ -℃ -№ -™ -Ⅱ -Ⅲ -Ⅳ -← -↑ -→ -↓ -⇋ -∈ -∑ -√ -∞ -∣ -∧ -∩ -∫ -∶ -≈ -≠ -≤ -≥ -⊙ -⊥ -① -② -③ -④ -⑧ -⑴ -⑵ -⑶ -─ -│ -┅ -┌ -├ -█ -▎ -▏ -▕ -■ -□ -▪ -▲ -△ -▼ -◆ -◇ -○ -◎ -● -◥ -★ -☆ -❋ -❤ -  -、 -。 -〇 -〉 -《 -》 -「 -」 -『 -』 -【 -】 -〔 -〕 -〖 -〗 -の -サ -シ -ジ -マ -ㄱ -ㆍ -㎏ -㎡ -㐂 -㐱 -㙟 -㴪 -㸃 -䖝 -䝉 -䰾 -䲁 -一 -丁 -七 -丄 -丈 -三 -上 -下 -丌 -不 -与 -丏 -丐 -丑 -且 -丕 -世 -丘 -丙 -丞 -丟 -両 -並 -丨 -丫 -中 -丰 -串 -丶 -丸 -丹 -主 -丼 -丿 -乂 -乃 -久 -么 -之 -乍 -乎 -乏 -乒 -乓 -乖 -乗 -乘 -乙 -乚 -乜 -九 -乞 -也 -乩 -乭 -乳 -乸 -乹 -乾 -亀 -亂 -亅 -了 -予 -亊 -事 -二 -亍 -云 -互 -亓 -五 -井 -亘 -些 -亜 -亞 -亟 -亠 -亡 -亢 -交 -亥 -亦 -亨 -享 -京 -亭 -亮 -亰 -亳 -亶 -亹 -人 -亻 -什 -仁 -仂 -仃 -仄 -仇 -仉 -今 -介 -仍 -仏 -仔 -仕 -他 -仗 -付 -仙 -仛 -仝 -仞 -仟 -仡 -代 -令 -以 -仨 -仫 -仮 -仰 -仲 -仳 -仵 -件 -仺 -任 -仼 -份 -仿 -企 -伃 -伈 -伉 -伊 -伋 -伍 -伎 -伏 -伐 -休 -伕 -伙 -伝 -伢 -伯 -估 -伱 -伴 -伶 -伷 -伸 -伺 -似 -伽 -伾 -佀 -佁 -佃 -但 -佇 -佈 -佉 -佋 -位 -低 -住 -佐 -佑 -体 -佔 -何 -佗 -佘 -余 -佚 -佛 -作 -佝 -佞 -佟 -你 -佣 -佤 -佧 -佩 -佬 -佯 -佰 -佳 -併 -佶 -佹 -佺 -佼 -佾 -使 -侁 -侃 -侄 -侅 -來 -侈 -侊 -例 -侍 -侏 -侑 -侖 -侗 -侘 -侚 -供 -依 -侞 -価 -侮 -侯 -侵 -侶 -侷 -侹 -便 -俁 -係 -促 -俄 -俅 -俊 -俋 -俌 -俍 -俎 -俏 -俐 -俑 -俗 -俘 -俚 -俛 -保 -俞 -俟 -俠 -信 -俬 -修 -俯 -俱 -俳 -俴 -俵 -俶 -俸 -俺 -俽 -俾 -倆 -倈 -倉 -個 -倌 -倍 -們 -倒 -倓 -倔 -倖 -倗 -倘 -候 -倚 -倜 -倞 -借 -倡 -倢 -倣 -値 -倦 -倧 -倩 -倪 -倫 -倬 -倭 -倮 -倻 -值 -偁 -偃 -假 -偈 -偉 -偊 -偌 -偍 -偎 -偏 -偓 -偕 -做 -停 -健 -偪 -偲 -側 -偵 -偶 -偷 -偸 -偽 -傀 -傃 -傅 -傈 -傉 -傍 -傑 -傒 -傕 -傖 -傘 -備 -傜 -傢 -傣 -催 -傭 -傲 -傳 -債 -傷 -傻 -傾 -僅 -僉 -僊 -働 -像 -僑 -僔 -僕 -僖 -僙 -僚 -僜 -僡 -僧 -僩 -僭 -僮 -僰 -僱 -僳 -僴 -僵 -價 -僻 -儀 -儁 -儂 -億 -儆 -儇 -儈 -儉 -儋 
-儐 -儒 -儔 -儕 -儘 -儚 -儞 -償 -儡 -儥 -儦 -優 -儫 -儱 -儲 -儷 -儺 -儻 -儼 -兀 -允 -元 -兄 -充 -兆 -先 -光 -克 -兌 -免 -児 -兒 -兔 -兕 -兗 -兜 -入 -內 -全 -兩 -兪 -八 -公 -六 -兮 -共 -兵 -其 -具 -典 -兼 -兿 -冀 -冂 -円 -冇 -冉 -冊 -再 -冏 -冑 -冒 -冕 -冖 -冗 -冚 -冠 -冢 -冤 -冥 -冧 -冨 -冪 -冫 -冬 -冮 -冰 -冴 -冶 -冷 -冼 -冽 -凃 -凄 -准 -凈 -凋 -凌 -凍 -凖 -凜 -凝 -凞 -几 -凡 -処 -凪 -凬 -凰 -凱 -凳 -凵 -凶 -凸 -凹 -出 -函 -刀 -刁 -刂 -刃 -刄 -分 -切 -刈 -刊 -刎 -刑 -划 -列 -初 -判 -別 -刦 -刧 -刨 -利 -刪 -刮 -到 -制 -刷 -券 -刺 -刻 -刼 -剁 -剃 -則 -削 -剋 -剌 -前 -剎 -剏 -剔 -剖 -剛 -剝 -剡 -剣 -剩 -剪 -剮 -副 -割 -創 -剿 -劃 -劄 -劇 -劈 -劉 -劊 -劌 -劍 -劑 -劔 -力 -功 -加 -劣 -助 -努 -劫 -劬 -劭 -劵 -効 -劼 -劾 -勁 -勃 -勅 -勇 -勉 -勐 -勑 -勒 -勔 -動 -勖 -勗 -勘 -務 -勛 -勝 -勞 -募 -勢 -勣 -勤 -勦 -勰 -勱 -勲 -勳 -勵 -勷 -勸 -勺 -勻 -勾 -勿 -匂 -匄 -包 -匆 -匈 -匋 -匍 -匏 -匐 -匕 -化 -北 -匙 -匚 -匝 -匠 -匡 -匣 -匪 -匯 -匱 -匸 -匹 -匾 -匿 -區 -十 -千 -卅 -升 -午 -卉 -半 -卋 -卍 -卐 -卑 -卒 -卓 -協 -南 -博 -卜 -卞 -卟 -占 -卡 -卣 -卦 -卧 -卩 -卬 -卮 -卯 -印 -危 -卲 -即 -卵 -卷 -卸 -卹 -卺 -卻 -卽 -卿 -厄 -厓 -厔 -厙 -厚 -厝 -原 -厥 -厭 -厰 -厲 -厴 -厶 -去 -參 -叄 -又 -叉 -及 -友 -反 -収 -叔 -叕 -取 -受 -叛 -叟 -叡 -叢 -口 -古 -句 -另 -叨 -叩 -只 -叫 -召 -叭 -叮 -可 -台 -叱 -史 -右 -叵 -司 -叻 -叼 -吁 -吃 -各 -吆 -合 -吉 -吊 -吋 -同 -名 -后 -吏 -吐 -向 -吒 -吔 -吖 -君 -吝 -吞 -吟 -吠 -吡 -吥 -否 -吧 -吩 -含 -吮 -吱 -吲 -吳 -吵 -吶 -吸 -吹 -吻 -吼 -吾 -呀 -呂 -呃 -呈 -呉 -告 -呋 -呎 -呢 -呤 -呦 -周 -呱 -味 -呵 -呷 -呸 -呼 -命 -呾 -咀 -咁 -咂 -咄 -咅 -咆 -咋 -和 -咎 -咑 -咒 -咔 -咕 -咖 -咗 -咘 -咚 -咟 -咤 -咥 -咧 -咨 -咩 -咪 -咫 -咬 -咭 -咯 -咱 -咲 -咳 -咸 -咻 -咼 -咽 -咾 -咿 -哀 -品 -哂 -哄 -哆 -哇 -哈 -哉 -哌 -哎 -哏 -哐 -哖 -哚 -哞 -員 -哥 -哦 -哨 -哩 -哪 -哭 -哮 -哱 -哲 -哺 -哼 -唃 -唄 -唆 -唇 -唉 -唏 -唐 -唑 -唔 -唘 -唧 -唫 -唬 -唭 -售 -唯 -唱 -唳 -唵 -唷 -唸 -唻 -唾 -啁 -啃 -啄 -商 -啉 -啊 -啍 -問 -啓 -啖 -啚 -啜 -啞 -啟 -啡 -啣 -啤 -啥 -啦 -啪 -啫 -啯 -啰 -啱 -啲 -啵 -啶 -啷 -啻 -啼 -啾 -喀 -喂 -喃 -善 -喆 -喇 -喈 -喉 -喊 -喋 -喏 -喔 -喘 -喙 -喚 -喜 -喝 -喢 -喦 -喧 -喪 -喫 -喬 -單 -喰 -喱 -喲 -喳 -喵 -喹 -喻 -喼 -嗄 -嗅 -嗆 -嗇 -嗊 -嗎 -嗑 -嗒 -嗓 -嗔 -嗖 -嗚 -嗜 -嗝 -嗞 -嗡 -嗢 -嗣 -嗦 -嗨 -嗩 -嗪 -嗮 -嗯 -嗲 -嗶 -嗹 -嗽 -嘀 -嘅 -嘆 -嘉 -嘌 -嘍 -嘎 -嘏 -嘔 -嘗 -嘚 -嘛 -嘜 -嘞 -嘟 -嘢 -嘣 -嘥 -嘧 -嘩 -嘬 -嘮 -嘯 -嘰 -嘲 -嘴 -嘶 -嘸 -嘹 -嘻 -嘿 -噁 -噌 -噍 -噏 -噓 -噗 -噝 -噠 -噢 -噤 -噥 -噦 -器 -噩 -噪 -噬 -噯 -噰 -噲 -噴 -噶 -噸 -噹 -噻 -嚇 -嚈 -嚎 -嚏 -嚐 -嚒 -嚓 -嚕 -嚗 -嚙 -嚞 -嚟 -嚤 -嚦 -嚧 -嚨 -嚩 -嚮 -嚳 -嚴 -嚶 -嚷 -嚼 -嚿 -囀 -囂 -囃 -囉 -囊 -囍 -囑 -囒 -囓 -囗 -囚 -四 -囝 -回 -因 -囡 -団 -囤 -囧 -囪 -囮 -囯 -困 -囲 -図 -囶 -囷 -囹 -固 -囿 -圂 -圃 -圄 -圈 -圉 -國 -圍 -圏 -園 -圓 -圖 -圗 -團 -圜 -土 -圧 -在 -圩 -圪 -圭 -圯 -地 -圳 -圻 -圾 -址 -均 -坊 -坋 -坌 -坍 -坎 -坐 -坑 -坖 -坡 -坣 -坤 -坦 -坨 -坩 -坪 -坫 -坬 -坭 -坮 -坯 -坳 -坵 -坶 -坷 -坻 -垂 -垃 -垈 -型 -垍 -垓 -垕 -垚 -垛 -垞 -垟 -垠 -垢 -垣 -垮 -垯 -垰 -垵 -垸 -垻 -垿 -埃 -埅 -埇 -埈 -埋 -埌 -城 -埏 -埒 -埔 -埕 -埗 -埜 -域 -埠 -埡 -埤 -埧 -埨 -埪 -埭 -埮 -埴 -埵 -執 -培 -基 -埻 -埼 -堀 -堂 -堃 -堅 -堆 -堇 -堈 -堉 -堊 -堍 -堖 -堝 -堡 -堤 -堦 -堪 -堮 -堯 -堰 -報 -場 -堵 -堷 -堺 -塀 -塅 -塆 -塊 -塋 -塌 -塍 -塏 -塑 -塔 -塗 -塘 -塙 -塜 -塞 -塡 -塢 -塤 -塨 -塩 -填 -塬 -塭 -塰 -塱 -塲 -塵 -塹 -塽 -塾 -墀 -境 -墅 -墉 -墊 -墎 -墓 -増 -墘 -墜 -增 -墟 -墡 -墣 -墨 -墩 -墫 -墬 -墮 -墱 -墳 -墺 -墼 -墾 -壁 -壄 -壆 -壇 -壋 -壌 -壎 -壐 -壑 -壓 -壔 -壕 -壘 -壙 -壞 -壟 -壠 -壢 -壤 -壩 -士 -壬 -壯 -壱 -壴 -壹 -壺 -壽 -夀 -夆 -変 -夊 -夋 -夌 -夏 -夔 -夕 -外 -夙 -多 -夜 -夠 -夢 -夤 -夥 -大 -天 -太 -夫 -夬 -夭 -央 -夯 -失 -夷 -夾 -奀 -奄 -奇 -奈 -奉 -奎 -奏 -奐 -契 -奓 -奔 -奕 -套 -奘 -奚 -奠 -奢 -奣 -奧 -奩 -奪 -奫 -奭 -奮 -女 -奴 -奶 -她 -好 -妀 -妁 -如 -妃 -妄 -妊 -妍 -妏 -妑 -妒 -妓 -妖 -妙 -妝 -妞 -妠 -妤 -妥 -妧 -妨 -妭 -妮 -妯 -妲 -妳 -妸 -妹 -妺 -妻 -妾 -姀 -姁 -姃 -姆 -姈 -姉 -姊 -始 -姌 -姍 -姐 -姑 -姒 -姓 -委 -姚 -姜 -姝 -姣 -姥 -姦 -姨 -姪 -姫 -姬 -姮 -姵 -姶 -姸 -姻 -姿 -威 -娃 -娉 -娋 -娌 -娍 -娎 -娑 -娖 -娘 -娛 -娜 -娟 -娠 -娣 -娥 -娩 -娫 -娳 -娶 -娸 -娼 -娽 -婀 -婁 -婆 -婉 -婊 -婑 -婕 -婚 -婢 -婦 -婧 -婪 -婭 -婯 -婷 -婺 -婻 -婼 -婿 -媃 -媄 -媊 -媐 -媒 -媓 -媖 -媗 -媚 -媛 -媜 -媞 -媧 -媭 -媯 -媲 -媳 -媺 -媼 -媽 -媾 -媿 -嫁 -嫂 -嫄 -嫈 -嫉 -嫌 -嫖 -嫘 -嫚 -嫡 -嫣 -嫦 -嫩 -嫪 -嫲 -嫳 -嫵 -嫺 -嫻 -嬅 -嬈 -嬉 -嬋 -嬌 -嬗 -嬛 -嬝 -嬡 -嬤 -嬨 -嬪 -嬬 -嬭 -嬰 -嬴 -嬸 -嬾 -嬿 -孀 -孃 -孆 -孋 -孌 -子 -孑 -孔 -孕 -孖 -字 -存 -孚 -孛 -孜 -孝 -孟 -孢 -季 -孤 -孩 -孫 -孬 -孮 -孰 -孳 -孵 -學 -孺 -孻 -孽 -孿 -宀 -它 -宅 -宇 -守 -安 -宋 -完 -宍 -宏 -宓 -宕 -宗 -官 -宙 -定 -宛 -宜 -実 -客 -宣 -室 -宥 -宦 -宧 -宮 -宰 -害 -宴 -宵 -家 -宸 -容 -宿 -寀 -寁 -寂 -寄 -寅 -密 -寇 -寈 -寊 -富 -寐 -寒 -寓 -寔 -寕 
-寖 -寗 -寘 -寛 -寜 -寞 -察 -寡 -寢 -寤 -寥 -實 -寧 -寨 -審 -寫 -寬 -寮 -寯 -寰 -寳 -寵 -寶 -寸 -寺 -対 -封 -専 -尃 -射 -將 -專 -尉 -尊 -尋 -對 -導 -小 -尐 -少 -尓 -尕 -尖 -尗 -尙 -尚 -尢 -尤 -尨 -尪 -尬 -就 -尷 -尹 -尺 -尻 -尼 -尾 -尿 -局 -屁 -屄 -居 -屆 -屇 -屈 -屋 -屌 -屍 -屎 -屏 -屐 -屑 -屓 -展 -屚 -屜 -屠 -屢 -層 -履 -屬 -屭 -屯 -山 -屹 -屺 -屻 -岀 -岈 -岌 -岐 -岑 -岔 -岡 -岢 -岣 -岧 -岩 -岪 -岫 -岬 -岰 -岱 -岳 -岵 -岷 -岸 -岻 -峁 -峅 -峇 -峋 -峍 -峒 -峘 -峙 -峚 -峠 -峨 -峩 -峪 -峭 -峯 -峰 -峴 -島 -峻 -峼 -峽 -崁 -崆 -崇 -崈 -崋 -崍 -崎 -崐 -崑 -崒 -崔 -崖 -崗 -崘 -崙 -崚 -崛 -崞 -崟 -崠 -崢 -崤 -崧 -崩 -崬 -崮 -崱 -崴 -崵 -崶 -崽 -嵇 -嵊 -嵋 -嵌 -嵎 -嵐 -嵒 -嵕 -嵖 -嵗 -嵙 -嵛 -嵜 -嵨 -嵩 -嵬 -嵮 -嵯 -嵰 -嵴 -嵻 -嵿 -嶁 -嶂 -嶃 -嶄 -嶇 -嶋 -嶌 -嶍 -嶒 -嶔 -嶗 -嶝 -嶠 -嶢 -嶦 -嶧 -嶪 -嶬 -嶰 -嶲 -嶴 -嶷 -嶸 -嶺 -嶼 -嶽 -巂 -巄 -巆 -巋 -巌 -巍 -巎 -巑 -巒 -巔 -巖 -巘 -巛 -川 -州 -巡 -巢 -工 -左 -巧 -巨 -巫 -差 -巰 -己 -已 -巳 -巴 -巶 -巷 -巻 -巽 -巾 -巿 -市 -布 -帆 -希 -帑 -帔 -帕 -帖 -帘 -帙 -帚 -帛 -帝 -帡 -帢 -帥 -師 -席 -帯 -帰 -帳 -帶 -帷 -常 -帽 -幀 -幃 -幄 -幅 -幌 -幔 -幕 -幗 -幚 -幛 -幟 -幡 -幢 -幣 -幪 -幫 -干 -平 -年 -幵 -幷 -幸 -幹 -幺 -幻 -幼 -幽 -幾 -庀 -庁 -広 -庇 -床 -序 -底 -庖 -店 -庚 -府 -庠 -庢 -庥 -度 -座 -庫 -庭 -庲 -庵 -庶 -康 -庸 -庹 -庼 -庾 -廁 -廂 -廄 -廆 -廈 -廉 -廊 -廋 -廌 -廍 -廑 -廓 -廔 -廕 -廖 -廙 -廚 -廝 -廞 -廟 -廠 -廡 -廢 -廣 -廧 -廨 -廩 -廬 -廰 -廱 -廳 -延 -廷 -廸 -建 -廻 -廼 -廿 -弁 -弄 -弅 -弇 -弈 -弉 -弊 -弋 -弍 -式 -弐 -弒 -弓 -弔 -引 -弖 -弗 -弘 -弛 -弟 -弢 -弦 -弧 -弨 -弩 -弭 -弱 -張 -強 -弸 -弼 -弾 -彀 -彄 -彅 -彆 -彈 -彊 -彌 -彎 -彐 -彔 -彖 -彗 -彘 -彙 -彜 -彞 -彠 -彡 -形 -彣 -彤 -彥 -彧 -彩 -彪 -彫 -彬 -彭 -彰 -影 -彳 -彷 -役 -彼 -彿 -往 -征 -徂 -待 -徇 -很 -徉 -徊 -律 -後 -徐 -徑 -徒 -得 -徘 -徙 -徜 -從 -徠 -御 -徧 -徨 -復 -循 -徫 -徬 -徭 -微 -徳 -徴 -徵 -德 -徸 -徹 -徽 -心 -忄 -必 -忉 -忌 -忍 -忐 -忑 -忒 -志 -忘 -忙 -応 -忝 -忞 -忠 -快 -忬 -忯 -忱 -忳 -念 -忻 -忽 -忿 -怍 -怎 -怒 -怕 -怖 -怙 -怛 -思 -怠 -怡 -急 -怦 -性 -怨 -怪 -怯 -怵 -恁 -恂 -恃 -恆 -恊 -恍 -恐 -恕 -恙 -恢 -恣 -恤 -恥 -恨 -恩 -恪 -恬 -恭 -息 -恰 -恵 -恿 -悄 -悅 -悆 -悉 -悌 -悍 -悔 -悖 -悚 -悛 -悝 -悞 -悟 -悠 -患 -悧 -您 -悪 -悰 -悲 -悳 -悵 -悶 -悸 -悼 -情 -惆 -惇 -惑 -惔 -惕 -惘 -惚 -惜 -惟 -惠 -惡 -惣 -惦 -惰 -惱 -惲 -想 -惶 -惹 -惺 -愁 -愃 -愆 -愈 -愉 -愍 -意 -愐 -愒 -愔 -愕 -愚 -愛 -愜 -感 -愣 -愧 -愨 -愫 -愭 -愴 -愷 -愼 -愾 -愿 -慄 -慈 -態 -慌 -慎 -慕 -慘 -慚 -慜 -慟 -慢 -慣 -慥 -慧 -慨 -慮 -慰 -慳 -慵 -慶 -慷 -慾 -憂 -憊 -憋 -憍 -憎 -憐 -憑 -憓 -憕 -憙 -憚 -憤 -憧 -憨 -憩 -憫 -憬 -憲 -憶 -憺 -憻 -憾 -懂 -懃 -懇 -懈 -應 -懋 -懌 -懍 -懐 -懣 -懦 -懮 -懲 -懵 -懶 -懷 -懸 -懺 -懼 -懽 -懾 -懿 -戀 -戇 -戈 -戊 -戌 -戍 -戎 -成 -我 -戒 -戔 -戕 -或 -戙 -戚 -戛 -戟 -戡 -戢 -戥 -戦 -戩 -截 -戮 -戰 -戱 -戲 -戳 -戴 -戶 -戸 -戻 -戽 -戾 -房 -所 -扁 -扆 -扇 -扈 -扉 -手 -扌 -才 -扎 -扒 -打 -扔 -托 -扙 -扛 -扞 -扣 -扥 -扦 -扭 -扮 -扯 -扳 -扶 -批 -扼 -找 -承 -技 -抃 -抄 -抇 -抉 -把 -抑 -抒 -抓 -投 -抖 -抗 -折 -抦 -披 -抬 -抱 -抵 -抹 -抻 -押 -抽 -抿 -拂 -拆 -拇 -拈 -拉 -拋 -拌 -拍 -拎 -拏 -拐 -拒 -拓 -拔 -拖 -拗 -拘 -拙 -拚 -招 -拜 -拝 -拡 -括 -拭 -拮 -拯 -拱 -拳 -拴 -拷 -拺 -拼 -拽 -拾 -拿 -持 -指 -按 -挎 -挑 -挖 -挙 -挨 -挪 -挫 -振 -挲 -挵 -挹 -挺 -挻 -挾 -捂 -捆 -捉 -捌 -捍 -捎 -捏 -捐 -捒 -捕 -捜 -捦 -捧 -捨 -捩 -捫 -捭 -捱 -捲 -捶 -捷 -捺 -捻 -掀 -掂 -掃 -掄 -掇 -授 -掉 -掌 -掏 -掐 -排 -掖 -掘 -掙 -掛 -掞 -掟 -掠 -採 -探 -掣 -接 -控 -推 -掩 -措 -掬 -掰 -掾 -揀 -揄 -揆 -揉 -揍 -描 -提 -插 -揔 -揖 -揚 -換 -握 -揪 -揭 -揮 -援 -揸 -揺 -損 -搏 -搐 -搓 -搔 -搖 -搗 -搜 -搞 -搠 -搢 -搪 -搬 -搭 -搳 -搴 -搵 -搶 -搽 -搾 -摂 -摒 -摔 -摘 -摜 -摞 -摟 -摠 -摧 -摩 -摭 -摯 -摳 -摴 -摵 -摶 -摸 -摹 -摺 -摻 -摽 -撃 -撇 -撈 -撐 -撒 -撓 -撕 -撖 -撙 -撚 -撞 -撣 -撤 -撥 -撩 -撫 -撬 -播 -撮 -撰 -撲 -撳 -撻 -撼 -撾 -撿 -擀 -擁 -擂 -擅 -擇 -擊 -擋 -操 -擎 -擒 -擔 -擘 -據 -擠 -擢 -擥 -擦 -擬 -擯 -擰 -擱 -擲 -擴 -擷 -擺 -擼 -擾 -攀 -攏 -攔 -攖 -攘 -攜 -攝 -攞 -攢 -攣 -攤 -攪 -攫 -攬 -支 -攴 -攵 -收 -攷 -攸 -改 -攻 -攽 -放 -政 -故 -效 -敍 -敎 -敏 -救 -敔 -敕 -敖 -敗 -敘 -教 -敝 -敞 -敟 -敢 -散 -敦 -敫 -敬 -敭 -敲 -整 -敵 -敷 -數 -敻 -敾 -斂 -斃 -文 -斌 -斎 -斐 -斑 -斕 -斖 -斗 -料 -斛 -斜 -斝 -斟 -斡 -斤 -斥 -斧 -斬 -斯 -新 -斷 -方 -於 -施 -斿 -旁 -旂 -旃 -旄 -旅 -旉 -旋 -旌 -旎 -族 -旖 -旗 -旙 -旛 -旡 -既 -日 -旦 -旨 -早 -旬 -旭 -旱 -旲 -旳 -旺 -旻 -旼 -旽 -旾 -旿 -昀 -昂 -昃 -昆 -昇 -昉 -昊 -昌 -昍 -明 -昏 -昐 -易 -昔 -昕 -昚 -昛 -昜 -昝 -昞 -星 -映 -昡 -昣 -昤 -春 -昧 -昨 -昪 -昫 -昭 -是 -昰 -昱 -昴 -昵 -昶 -昺 -晁 -時 -晃 -晈 -晉 -晊 -晏 -晗 -晙 -晚 -晛 -晝 -晞 -晟 -晤 -晦 -晧 -晨 -晩 -晪 -晫 -晭 -普 -景 -晰 -晳 -晴 -晶 -晷 -晸 -智 -晾 -暃 -暄 -暅 -暇 -暈 -暉 -暊 -暌 -暎 -暏 -暐 -暑 -暕 -暖 -暗 -暘 -暝 -暟 -暠 -暢 -暦 -暨 -暫 -暮 -暱 -暲 -暴 -暸 -暹 -暻 -暾 -曄 -曅 -曆 -曇 -曉 
-曌 -曔 -曖 -曙 -曜 -曝 -曠 -曦 -曧 -曨 -曩 -曬 -曮 -曰 -曲 -曳 -更 -曶 -曷 -書 -曹 -曺 -曼 -曽 -曾 -替 -最 -會 -月 -有 -朊 -朋 -服 -朏 -朐 -朓 -朔 -朕 -朖 -朗 -望 -朝 -期 -朦 -朧 -木 -未 -末 -本 -札 -朱 -朴 -朵 -朶 -朽 -朿 -杁 -杉 -杋 -杌 -李 -杏 -材 -村 -杓 -杖 -杙 -杜 -杞 -束 -杠 -杣 -杤 -杧 -杬 -杭 -杯 -東 -杲 -杳 -杴 -杵 -杷 -杻 -杼 -松 -板 -极 -枇 -枉 -枋 -枏 -析 -枕 -枖 -林 -枚 -枛 -果 -枝 -枠 -枡 -枯 -枰 -枱 -枲 -枳 -架 -枷 -枸 -枹 -枼 -柁 -柃 -柄 -柉 -柊 -柎 -柏 -某 -柑 -柒 -染 -柔 -柘 -柚 -柜 -柝 -柞 -柟 -查 -柩 -柬 -柯 -柰 -柱 -柳 -柴 -柵 -柶 -柷 -査 -柾 -柿 -栃 -栄 -栐 -栒 -栓 -栜 -栝 -栞 -校 -栢 -栨 -栩 -株 -栲 -栴 -核 -根 -栻 -格 -栽 -桀 -桁 -桂 -桃 -桄 -桅 -框 -案 -桉 -桌 -桎 -桐 -桑 -桓 -桔 -桕 -桖 -桙 -桜 -桝 -桫 -桱 -桲 -桴 -桶 -桷 -桼 -桿 -梀 -梁 -梂 -梃 -梅 -梆 -梉 -梏 -梓 -梔 -梗 -梘 -條 -梟 -梠 -梢 -梣 -梧 -梨 -梫 -梭 -梯 -械 -梱 -梳 -梵 -梶 -梽 -棄 -棆 -棉 -棋 -棍 -棐 -棒 -棓 -棕 -棖 -棗 -棘 -棚 -棛 -棟 -棠 -棡 -棣 -棧 -棨 -棩 -棪 -棫 -森 -棱 -棲 -棵 -棶 -棹 -棺 -棻 -棼 -棽 -椅 -椆 -椇 -椋 -植 -椎 -椏 -椒 -椙 -椥 -椪 -椰 -椲 -椴 -椵 -椹 -椽 -椿 -楂 -楊 -楓 -楔 -楗 -楙 -楚 -楝 -楞 -楠 -楡 -楢 -楣 -楤 -楦 -楧 -楨 -楫 -業 -楮 -楯 -楳 -極 -楷 -楸 -楹 -楽 -楿 -概 -榆 -榊 -榍 -榎 -榑 -榔 -榕 -榖 -榗 -榘 -榛 -榜 -榞 -榢 -榣 -榤 -榦 -榧 -榨 -榫 -榭 -榮 -榲 -榴 -榷 -榻 -榿 -槀 -槁 -槃 -槊 -構 -槌 -槍 -槎 -槐 -槓 -槔 -槗 -様 -槙 -槤 -槩 -槭 -槰 -槱 -槲 -槳 -槺 -槻 -槼 -槽 -槿 -樀 -樁 -樂 -樅 -樆 -樊 -樋 -樑 -樓 -樗 -樘 -標 -樞 -樟 -模 -樣 -樨 -権 -樫 -樵 -樸 -樹 -樺 -樻 -樽 -樾 -橄 -橇 -橈 -橋 -橐 -橒 -橓 -橘 -橙 -橚 -機 -橡 -橢 -橪 -橫 -橿 -檀 -檄 -檇 -檉 -檊 -檎 -檐 -檔 -檗 -檜 -檞 -檠 -檡 -檢 -檣 -檦 -檨 -檫 -檬 -檯 -檳 -檵 -檸 -檻 -檽 -櫂 -櫃 -櫆 -櫈 -櫓 -櫚 -櫛 -櫞 -櫟 -櫥 -櫨 -櫪 -櫱 -櫸 -櫻 -櫾 -櫿 -欄 -欉 -權 -欏 -欒 -欖 -欞 -欠 -次 -欣 -欥 -欲 -欸 -欹 -欺 -欽 -款 -歆 -歇 -歉 -歊 -歌 -歎 -歐 -歓 -歙 -歛 -歡 -止 -正 -此 -步 -武 -歧 -歩 -歪 -歲 -歳 -歴 -歷 -歸 -歹 -死 -歿 -殂 -殃 -殄 -殆 -殉 -殊 -殑 -殖 -殘 -殛 -殞 -殟 -殤 -殭 -殮 -殯 -殲 -殳 -段 -殷 -殺 -殻 -殼 -殿 -毀 -毅 -毆 -毉 -毋 -毌 -母 -毎 -每 -毐 -毒 -毓 -比 -毖 -毗 -毘 -毛 -毫 -毬 -毯 -毴 -毸 -毽 -毿 -氂 -氈 -氍 -氏 -氐 -民 -氓 -氖 -気 -氘 -氙 -氚 -氛 -氟 -氣 -氦 -氧 -氨 -氪 -氫 -氬 -氮 -氯 -氰 -水 -氵 -氷 -永 -氹 -氻 -氽 -氾 -汀 -汁 -求 -汊 -汎 -汐 -汕 -汗 -汛 -汜 -汝 -汞 -江 -池 -污 -汧 -汨 -汩 -汪 -汭 -汰 -汲 -汴 -汶 -決 -汽 -汾 -沁 -沂 -沃 -沄 -沅 -沆 -沇 -沈 -沉 -沌 -沍 -沏 -沐 -沒 -沓 -沔 -沖 -沘 -沙 -沚 -沛 -沜 -沢 -沨 -沫 -沭 -沮 -沯 -沱 -河 -沸 -油 -沺 -治 -沼 -沽 -沾 -沿 -況 -泂 -泄 -泆 -泇 -泉 -泊 -泌 -泐 -泓 -泔 -法 -泖 -泗 -泚 -泛 -泠 -泡 -波 -泣 -泥 -泩 -泫 -泮 -泯 -泰 -泱 -泳 -泵 -洄 -洋 -洌 -洎 -洗 -洙 -洛 -洞 -洢 -洣 -洤 -津 -洨 -洩 -洪 -洮 -洱 -洲 -洳 -洵 -洸 -洹 -洺 -活 -洽 -派 -流 -浄 -浙 -浚 -浛 -浜 -浞 -浟 -浠 -浡 -浣 -浤 -浥 -浦 -浩 -浪 -浮 -浯 -浴 -浵 -海 -浸 -浹 -涅 -涇 -消 -涉 -涌 -涎 -涑 -涓 -涔 -涕 -涙 -涪 -涫 -涮 -涯 -液 -涵 -涸 -涼 -涿 -淄 -淅 -淆 -淇 -淋 -淌 -淍 -淎 -淏 -淑 -淓 -淖 -淘 -淙 -淚 -淛 -淝 -淞 -淠 -淡 -淤 -淥 -淦 -淨 -淩 -淪 -淫 -淬 -淮 -淯 -淰 -深 -淳 -淵 -淶 -混 -淸 -淹 -淺 -添 -淼 -淽 -渃 -清 -済 -渉 -渋 -渕 -渙 -渚 -減 -渝 -渟 -渠 -渡 -渣 -渤 -渥 -渦 -渫 -測 -渭 -港 -渲 -渴 -游 -渺 -渼 -渽 -渾 -湃 -湄 -湉 -湊 -湍 -湓 -湔 -湖 -湘 -湛 -湜 -湞 -湟 -湣 -湥 -湧 -湫 -湮 -湯 -湳 -湴 -湼 -満 -溁 -溇 -溈 -溉 -溋 -溎 -溏 -源 -準 -溙 -溜 -溝 -溟 -溢 -溥 -溦 -溧 -溪 -溫 -溯 -溱 -溲 -溴 -溵 -溶 -溺 -溼 -滀 -滁 -滂 -滄 -滅 -滇 -滈 -滉 -滋 -滌 -滎 -滏 -滑 -滓 -滔 -滕 -滘 -滙 -滝 -滬 -滯 -滲 -滴 -滷 -滸 -滹 -滻 -滽 -滾 -滿 -漁 -漂 -漆 -漇 -漈 -漎 -漏 -漓 -演 -漕 -漚 -漠 -漢 -漣 -漩 -漪 -漫 -漬 -漯 -漱 -漲 -漳 -漴 -漵 -漷 -漸 -漼 -漾 -漿 -潁 -潑 -潔 -潘 -潛 -潞 -潟 -潢 -潤 -潭 -潮 -潯 -潰 -潲 -潺 -潼 -潽 -潾 -潿 -澀 -澁 -澂 -澄 -澆 -澇 -澈 -澉 -澋 -澌 -澍 -澎 -澔 -澗 -澠 -澡 -澣 -澤 -澥 -澧 -澪 -澮 -澯 -澱 -澳 -澶 -澹 -澻 -激 -濁 -濂 -濃 -濉 -濊 -濋 -濕 -濘 -濙 -濛 -濞 -濟 -濠 -濡 -濤 -濫 -濬 -濮 -濯 -濰 -濱 -濲 -濶 -濺 -濼 -濾 -瀁 -瀅 -瀆 -瀉 -瀍 -瀏 -瀑 -瀔 -瀕 -瀘 -瀚 -瀛 -瀝 -瀞 -瀟 -瀠 -瀣 -瀦 -瀧 -瀨 -瀬 -瀰 -瀲 -瀴 -瀶 -瀹 -瀾 -灃 -灊 -灌 -灑 -灘 -灝 -灞 -灡 -灣 -灤 -灧 -火 -灰 -灴 -灸 -灼 -災 -炁 -炅 -炆 -炊 -炎 -炒 -炔 -炕 -炘 -炙 -炟 -炣 -炤 -炫 -炬 -炭 -炮 -炯 -炱 -炲 -炳 -炷 -炸 -為 -炻 -烈 -烉 -烊 -烋 -烏 -烒 -烔 -烘 -烙 -烜 -烝 -烤 -烯 -烱 -烴 -烷 -烹 -烺 -烽 -焃 -焄 -焉 -焊 -焌 -焓 -焗 -焙 -焚 -焜 -焞 -無 -焦 -焯 -焰 -焱 -焴 -然 -焻 -焼 -焿 -煇 -煉 -煊 -煌 -煎 -煐 -煒 -煔 -煕 -煖 -煙 -煚 -煜 -煞 -煠 -煤 -煥 -煦 -照 -煨 -煩 -煬 -煮 -煲 -煳 -煵 -煶 -煸 -煽 -熄 -熅 -熇 -熈 -熊 -熏 -熒 -熔 -熖 -熗 -熘 -熙 -熜 -熟 -熠 -熤 -熥 -熨 -熬 -熯 -熱 -熲 -熳 -熵 -熹 -熺 -熼 -熾 -熿 -燁 -燃 -燄 -燈 -燉 -燊 -燎 -燏 -燐 -燒 -燔 -燕 -燘 -燙 -燚 -燜 -燝 -營 -燥 -燦 -燧 -燫 -燬 -燭 -燮 -燴 -燹 -燻 -燼 -燾 -燿 -爀 -爆 -爌 -爍 -爐 -爔 -爚 -爛 -爝 -爨 
-爪 -爬 -爭 -爯 -爰 -爲 -爵 -父 -爸 -爹 -爺 -爻 -爽 -爾 -爿 -牁 -牂 -牆 -片 -版 -牌 -牒 -牕 -牖 -牘 -牙 -牛 -牝 -牟 -牠 -牡 -牢 -牧 -物 -牯 -牲 -特 -牻 -牼 -牽 -犀 -犁 -犂 -犇 -犍 -犎 -犖 -犛 -犢 -犧 -犨 -犬 -犯 -犰 -犴 -犽 -狀 -狂 -狄 -狍 -狎 -狐 -狒 -狓 -狗 -狙 -狛 -狟 -狠 -狡 -狦 -狨 -狩 -狳 -狶 -狷 -狸 -狹 -狻 -狼 -猁 -猄 -猇 -猊 -猗 -猙 -猛 -猜 -猝 -猞 -猢 -猥 -猨 -猩 -猳 -猴 -猶 -猷 -猺 -猻 -猾 -猿 -獁 -獃 -獄 -獅 -獇 -獎 -獏 -獐 -獒 -獠 -獢 -獣 -獨 -獬 -獮 -獯 -獰 -獲 -獴 -獵 -獷 -獸 -獺 -獻 -獼 -獾 -玀 -玄 -玆 -率 -玉 -王 -玎 -玏 -玓 -玕 -玖 -玗 -玘 -玙 -玟 -玠 -玡 -玢 -玥 -玧 -玨 -玩 -玫 -玭 -玲 -玳 -玶 -玷 -玹 -玻 -玾 -珀 -珂 -珅 -珈 -珉 -珊 -珌 -珍 -珎 -珏 -珖 -珙 -珝 -珞 -珠 -珡 -珣 -珤 -珥 -珦 -珧 -珩 -珪 -班 -珮 -珵 -珹 -珺 -珽 -現 -琁 -球 -琄 -琅 -理 -琇 -琉 -琊 -琍 -琎 -琚 -琛 -琡 -琢 -琤 -琥 -琦 -琨 -琪 -琬 -琮 -琯 -琰 -琱 -琳 -琴 -琵 -琶 -琹 -琺 -琿 -瑀 -瑁 -瑂 -瑄 -瑅 -瑆 -瑈 -瑊 -瑋 -瑑 -瑒 -瑕 -瑗 -瑙 -瑚 -瑛 -瑜 -瑝 -瑞 -瑟 -瑠 -瑢 -瑣 -瑤 -瑥 -瑧 -瑨 -瑩 -瑪 -瑭 -瑯 -瑰 -瑱 -瑳 -瑴 -瑺 -瑾 -璀 -璁 -璃 -璄 -璆 -璇 -璈 -璉 -璋 -璌 -璐 -璕 -璘 -璙 -璚 -璜 -璞 -璟 -璠 -璡 -璣 -璥 -璦 -璧 -璨 -璩 -璪 -璫 -璬 -璮 -環 -璱 -璵 -璸 -璹 -璽 -璿 -瓈 -瓊 -瓌 -瓏 -瓑 -瓔 -瓖 -瓘 -瓚 -瓛 -瓜 -瓞 -瓠 -瓢 -瓣 -瓤 -瓦 -瓮 -瓴 -瓶 -瓷 -瓿 -甂 -甄 -甌 -甍 -甑 -甕 -甘 -甙 -甚 -甜 -生 -甡 -產 -産 -甥 -甦 -用 -甩 -甪 -甫 -甬 -甯 -田 -由 -甲 -申 -男 -甸 -甹 -町 -甾 -畀 -畇 -畈 -畊 -畋 -界 -畎 -畏 -畐 -畑 -畔 -留 -畜 -畝 -畠 -畢 -略 -畦 -畧 -番 -畫 -畬 -畯 -異 -畲 -畳 -畵 -當 -畷 -畸 -畹 -畿 -疃 -疆 -疇 -疊 -疋 -疌 -疍 -疏 -疑 -疒 -疕 -疙 -疚 -疝 -疣 -疤 -疥 -疫 -疲 -疳 -疵 -疸 -疹 -疼 -疽 -疾 -痂 -病 -症 -痊 -痍 -痔 -痕 -痘 -痙 -痛 -痞 -痟 -痠 -痢 -痣 -痤 -痧 -痩 -痰 -痱 -痲 -痴 -痹 -痺 -痿 -瘀 -瘁 -瘊 -瘋 -瘍 -瘓 -瘙 -瘜 -瘞 -瘟 -瘠 -瘡 -瘢 -瘤 -瘦 -瘧 -瘩 -瘰 -瘴 -瘺 -癀 -療 -癆 -癇 -癌 -癒 -癖 -癘 -癜 -癟 -癡 -癢 -癤 -癥 -癩 -癬 -癭 -癮 -癯 -癰 -癱 -癲 -癸 -発 -登 -發 -白 -百 -皂 -的 -皆 -皇 -皈 -皋 -皎 -皐 -皓 -皖 -皙 -皚 -皛 -皝 -皞 -皮 -皰 -皴 -皷 -皸 -皺 -皿 -盂 -盃 -盅 -盆 -盈 -益 -盋 -盌 -盎 -盒 -盔 -盛 -盜 -盞 -盟 -盡 -監 -盤 -盥 -盦 -盧 -盨 -盩 -盪 -盫 -目 -盯 -盱 -盲 -直 -盷 -相 -盹 -盺 -盼 -盾 -眀 -省 -眉 -看 -県 -眙 -眛 -眜 -眞 -真 -眠 -眥 -眨 -眩 -眭 -眯 -眵 -眶 -眷 -眸 -眺 -眼 -眾 -着 -睇 -睛 -睜 -睞 -睡 -睢 -督 -睥 -睦 -睨 -睪 -睫 -睭 -睹 -睺 -睽 -睾 -睿 -瞄 -瞅 -瞋 -瞌 -瞎 -瞑 -瞓 -瞞 -瞢 -瞥 -瞧 -瞪 -瞫 -瞬 -瞭 -瞰 -瞳 -瞻 -瞼 -瞽 -瞿 -矇 -矍 -矗 -矚 -矛 -矜 -矞 -矢 -矣 -知 -矧 -矩 -短 -矮 -矯 -石 -矸 -矽 -砂 -砋 -砌 -砍 -砒 -研 -砝 -砢 -砥 -砦 -砧 -砩 -砫 -砭 -砮 -砯 -砰 -砲 -砳 -破 -砵 -砷 -砸 -砼 -硂 -硃 -硅 -硇 -硏 -硐 -硒 -硓 -硚 -硜 -硝 -硤 -硨 -硫 -硬 -硭 -硯 -硼 -碁 -碇 -碉 -碌 -碎 -碑 -碓 -碕 -碗 -碘 -碚 -碟 -碡 -碣 -碧 -碩 -碪 -碭 -碰 -碲 -碳 -碴 -碶 -碸 -確 -碻 -碼 -碽 -碾 -磁 -磅 -磊 -磋 -磐 -磔 -磕 -磘 -磙 -磚 -磜 -磡 -磨 -磪 -磬 -磯 -磱 -磲 -磵 -磷 -磺 -磻 -磾 -礁 -礄 -礎 -礐 -礑 -礒 -礙 -礠 -礦 -礪 -礫 -礬 -礮 -礱 -礴 -示 -礻 -礽 -社 -祀 -祁 -祂 -祆 -祇 -祈 -祉 -祋 -祏 -祐 -祓 -祕 -祖 -祗 -祙 -祚 -祛 -祜 -祝 -神 -祟 -祠 -祥 -祧 -票 -祭 -祹 -祺 -祼 -祿 -禁 -禃 -禇 -禍 -禎 -福 -禑 -禓 -禔 -禕 -禘 -禛 -禟 -禠 -禤 -禦 -禧 -禨 -禩 -禪 -禮 -禰 -禱 -禵 -禹 -禺 -禼 -禽 -禾 -禿 -秀 -私 -秈 -秉 -秋 -科 -秒 -秕 -秘 -租 -秠 -秣 -秤 -秦 -秧 -秩 -秭 -秳 -秸 -移 -稀 -稅 -稈 -稉 -程 -稍 -稑 -稔 -稗 -稘 -稙 -稚 -稜 -稞 -稟 -稠 -種 -稱 -稲 -稷 -稹 -稺 -稻 -稼 -稽 -稾 -稿 -穀 -穂 -穆 -穈 -穉 -穌 -積 -穎 -穗 -穟 -穠 -穡 -穢 -穣 -穩 -穫 -穰 -穴 -穵 -究 -穹 -空 -穿 -突 -窄 -窅 -窈 -窋 -窒 -窕 -窖 -窗 -窘 -窟 -窠 -窣 -窨 -窩 -窪 -窮 -窯 -窰 -窶 -窺 -窿 -竄 -竅 -竇 -竈 -竊 -立 -竑 -站 -竜 -竟 -章 -竣 -童 -竦 -竩 -竭 -端 -競 -竹 -竺 -竻 -竿 -笄 -笆 -笈 -笏 -笑 -笘 -笙 -笛 -笞 -笠 -笥 -符 -笨 -笩 -笪 -第 -笭 -笮 -笯 -笱 -笳 -笹 -筅 -筆 -等 -筊 -筋 -筌 -筍 -筏 -筐 -筒 -答 -策 -筘 -筠 -筥 -筦 -筧 -筬 -筭 -筱 -筲 -筳 -筵 -筶 -筷 -筻 -箆 -箇 -箋 -箍 -箏 -箐 -箑 -箒 -箔 -箕 -算 -箜 -管 -箬 -箭 -箱 -箴 -箸 -節 -篁 -範 -篆 -篇 -築 -篊 -篋 -篌 -篔 -篙 -篝 -篠 -篡 -篤 -篥 -篦 -篩 -篪 -篭 -篯 -篳 -篷 -簀 -簃 -簇 -簉 -簋 -簍 -簑 -簕 -簗 -簞 -簠 -簡 -簧 -簪 -簫 -簷 -簸 -簹 -簺 -簽 -簾 -簿 -籀 -籃 -籌 -籍 -籐 -籙 -籛 -籜 -籝 -籟 -籠 -籣 -籤 -籥 -籪 -籬 -籮 -籲 -米 -籽 -籾 -粄 -粉 -粍 -粑 -粒 -粕 -粗 -粘 -粟 -粢 -粥 -粦 -粧 -粩 -粱 -粲 -粳 -粵 -粹 -粼 -粽 -精 -粿 -糀 -糅 -糊 -糌 -糍 -糎 -糕 -糖 -糙 -糜 -糝 -糞 -糟 -糠 -糢 -糧 -糬 -糯 -糰 -糴 -糶 -糸 -糹 -糺 -系 -糾 -紀 -紂 -約 -紅 -紆 -紇 -紈 -紉 -紊 -紋 -納 -紐 -紑 -紓 -純 -紕 -紗 -紘 -紙 -級 -紛 -紜 -紝 -紞 -素 -紡 -索 -紫 -紮 -累 -細 -紱 -紲 -紳 -紵 -紹 -紺 -紿 -終 -絃 -組 -絆 -経 -絎 -結 -絕 -絛 -絜 -絞 -絡 -絢 -給 -絨 -絪 -絮 -統 -絲 -絳 -絵 -絶 -絹 -絺 -綁 -綃 -綈 -綉 -綎 -綏 -經 -綖 -継 -続 -綜 -綝 -綞 -綠 -綢 -綣 -綦 -綧 -綫 -綬 -維 -綮 -綰 -綱 -網 -綳 -綴 -綸 -綺 -綻 -綽 -綾 -綿 -緁 -緃 -緄 -緈 -緊 
-緋 -総 -緑 -緒 -緖 -緘 -線 -緜 -緝 -緞 -締 -緡 -緣 -緤 -編 -緩 -緬 -緯 -緱 -緲 -練 -緹 -緻 -縂 -縄 -縈 -縉 -縊 -縕 -縛 -縝 -縞 -縠 -縡 -縣 -縤 -縫 -縮 -縯 -縱 -縴 -縵 -縷 -縹 -縻 -總 -績 -繁 -繃 -繆 -繇 -繒 -織 -繕 -繖 -繙 -繚 -繞 -繡 -繩 -繪 -繫 -繭 -繰 -繳 -繹 -繻 -繼 -繽 -繾 -纁 -纂 -纈 -續 -纍 -纏 -纓 -纔 -纕 -纖 -纘 -纛 -纜 -缐 -缶 -缸 -缺 -缽 -罃 -罄 -罅 -罈 -罉 -罌 -罍 -罐 -罔 -罕 -罘 -罟 -罡 -罨 -罩 -罪 -置 -罰 -罱 -署 -罳 -罵 -罶 -罷 -罹 -罽 -羂 -羅 -羆 -羈 -羊 -羋 -羌 -美 -羔 -羕 -羗 -羙 -羚 -羞 -羡 -羣 -群 -羥 -羧 -羨 -義 -羯 -羰 -羱 -羲 -羸 -羹 -羽 -羿 -翀 -翁 -翂 -翃 -翅 -翊 -翌 -翎 -翏 -習 -翔 -翕 -翙 -翜 -翟 -翠 -翡 -翥 -翦 -翩 -翬 -翮 -翰 -翱 -翳 -翹 -翻 -翼 -耀 -老 -考 -耄 -者 -耆 -而 -耍 -耎 -耐 -耑 -耒 -耔 -耕 -耗 -耘 -耙 -耜 -耦 -耨 -耬 -耳 -耵 -耶 -耷 -耽 -耿 -聃 -聆 -聊 -聒 -聖 -聘 -聚 -聞 -聟 -聨 -聯 -聰 -聱 -聲 -聳 -聴 -聶 -職 -聽 -聾 -聿 -肄 -肅 -肆 -肇 -肉 -肋 -肌 -肏 -肖 -肘 -肚 -肛 -肜 -肝 -肟 -股 -肢 -肥 -肩 -肪 -肫 -肯 -肱 -育 -肸 -肹 -肺 -肼 -肽 -胂 -胃 -胄 -胅 -胇 -胊 -背 -胍 -胎 -胖 -胗 -胙 -胚 -胛 -胝 -胞 -胡 -胤 -胥 -胬 -胭 -胰 -胱 -胳 -胴 -胸 -胺 -胼 -能 -脂 -脅 -脆 -脇 -脈 -脊 -脒 -脖 -脘 -脛 -脣 -脩 -脫 -脬 -脭 -脯 -脲 -脳 -脷 -脹 -脾 -腆 -腈 -腊 -腋 -腌 -腎 -腐 -腑 -腓 -腔 -腕 -腥 -腦 -腧 -腩 -腫 -腮 -腰 -腱 -腳 -腴 -腸 -腹 -腺 -腿 -膀 -膂 -膈 -膊 -膏 -膚 -膛 -膜 -膝 -膠 -膣 -膥 -膦 -膨 -膩 -膮 -膳 -膺 -膽 -膾 -膿 -臀 -臂 -臃 -臆 -臉 -臊 -臍 -臏 -臘 -臚 -臞 -臟 -臠 -臣 -臧 -臨 -自 -臭 -臯 -至 -致 -臺 -臻 -臼 -臾 -舂 -舅 -與 -興 -舉 -舊 -舌 -舍 -舎 -舒 -舔 -舖 -舘 -舛 -舜 -舞 -舟 -舢 -舥 -舨 -舩 -航 -舫 -般 -舲 -舵 -舶 -舷 -舸 -船 -舺 -艅 -艇 -艉 -艋 -艎 -艏 -艔 -艘 -艙 -艚 -艦 -艮 -良 -艱 -色 -艶 -艷 -艸 -艽 -艾 -艿 -芃 -芊 -芋 -芍 -芎 -芑 -芒 -芘 -芙 -芛 -芝 -芡 -芥 -芨 -芩 -芪 -芫 -芬 -芭 -芮 -芯 -花 -芳 -芴 -芷 -芸 -芹 -芻 -芽 -芾 -苄 -苅 -苑 -苒 -苓 -苔 -苕 -苗 -苛 -苜 -苝 -苞 -苟 -苡 -苣 -苤 -若 -苦 -苧 -苪 -苫 -苯 -英 -苳 -苴 -苷 -苺 -苻 -苼 -苾 -茀 -茁 -茂 -范 -茄 -茅 -茆 -茇 -茈 -茉 -茌 -茗 -茘 -茚 -茛 -茜 -茝 -茨 -茫 -茬 -茭 -茮 -茯 -茱 -茲 -茴 -茵 -茶 -茷 -茸 -茹 -茺 -茼 -荀 -荃 -荅 -荇 -草 -荊 -荎 -荏 -荒 -荔 -荖 -荘 -荳 -荷 -荸 -荻 -荼 -荽 -莆 -莉 -莊 -莎 -莒 -莓 -莕 -莖 -莘 -莙 -莛 -莜 -莞 -莠 -莢 -莧 -莨 -莩 -莪 -莫 -莽 -莿 -菀 -菁 -菅 -菇 -菈 -菉 -菊 -菌 -菍 -菏 -菑 -菓 -菔 -菖 -菘 -菜 -菝 -菟 -菠 -菡 -菥 -菩 -菪 -菫 -華 -菰 -菱 -菲 -菴 -菶 -菸 -菹 -菺 -菼 -菽 -菾 -萁 -萃 -萄 -萇 -萊 -萌 -萍 -萎 -萐 -萘 -萜 -萠 -萡 -萣 -萩 -萬 -萭 -萱 -萵 -萸 -萹 -萼 -落 -葃 -葆 -葉 -葊 -葎 -葑 -葒 -著 -葙 -葚 -葛 -葜 -葝 -葡 -董 -葦 -葩 -葫 -葬 -葭 -葯 -葰 -葳 -葵 -葶 -葷 -葺 -蒂 -蒄 -蒍 -蒎 -蒐 -蒓 -蒔 -蒗 -蒙 -蒜 -蒞 -蒟 -蒡 -蒢 -蒤 -蒧 -蒨 -蒭 -蒯 -蒲 -蒴 -蒸 -蒹 -蒺 -蒻 -蒼 -蒽 -蒾 -蒿 -蓀 -蓁 -蓂 -蓄 -蓆 -蓉 -蓋 -蓍 -蓑 -蓓 -蓖 -蓘 -蓚 -蓧 -蓨 -蓪 -蓬 -蓭 -蓮 -蓯 -蓳 -蓼 -蓽 -蓿 -蔆 -蔎 -蔑 -蔓 -蔔 -蔕 -蔗 -蔘 -蔚 -蔝 -蔞 -蔡 -蔣 -蔥 -蔦 -蔬 -蔭 -蔴 -蔵 -蔻 -蔽 -蕁 -蕃 -蕅 -蕈 -蕉 -蕊 -蕎 -蕑 -蕒 -蕖 -蕘 -蕙 -蕚 -蕟 -蕡 -蕢 -蕤 -蕨 -蕩 -蕪 -蕭 -蕷 -蕹 -蕺 -蕻 -蕾 -薀 -薄 -薆 -薇 -薈 -薊 -薌 -薏 -薐 -薑 -薔 -薗 -薘 -薙 -薛 -薜 -薞 -薟 -薡 -薦 -薨 -薩 -薪 -薫 -薬 -薯 -薰 -薲 -薷 -薸 -薹 -薺 -薾 -薿 -藁 -藉 -藍 -藎 -藏 -藐 -藔 -藕 -藜 -藝 -藟 -藤 -藥 -藦 -藨 -藩 -藪 -藶 -藸 -藹 -藺 -藻 -藿 -蘂 -蘄 -蘅 -蘆 -蘇 -蘊 -蘋 -蘐 -蘑 -蘓 -蘗 -蘘 -蘚 -蘞 -蘢 -蘧 -蘩 -蘭 -蘵 -蘶 -蘸 -蘼 -蘿 -虉 -虎 -虐 -虓 -虔 -處 -虖 -虛 -虜 -虞 -號 -虢 -虧 -虨 -虯 -虱 -虵 -虹 -虺 -虻 -蚆 -蚊 -蚋 -蚌 -蚍 -蚓 -蚖 -蚜 -蚝 -蚡 -蚢 -蚣 -蚤 -蚧 -蚨 -蚩 -蚪 -蚯 -蚱 -蚴 -蚵 -蚶 -蚺 -蚼 -蛀 -蛄 -蛇 -蛉 -蛋 -蛍 -蛐 -蛑 -蛔 -蛙 -蛛 -蛞 -蛟 -蛤 -蛭 -蛯 -蛸 -蛹 -蛺 -蛻 -蛾 -蜀 -蜂 -蜃 -蜆 -蜇 -蜈 -蜉 -蜊 -蜍 -蜑 -蜒 -蜓 -蜘 -蜚 -蜛 -蜜 -蜞 -蜢 -蜣 -蜥 -蜨 -蜮 -蜯 -蜱 -蜴 -蜷 -蜻 -蜾 -蜿 -蝀 -蝌 -蝍 -蝎 -蝓 -蝕 -蝗 -蝘 -蝙 -蝚 -蝟 -蝠 -蝣 -蝤 -蝦 -蝨 -蝮 -蝯 -蝰 -蝲 -蝴 -蝶 -蝸 -蝽 -螂 -螃 -螄 -螅 -螈 -螋 -融 -螐 -螔 -螞 -螟 -螠 -螢 -螣 -螥 -螫 -螭 -螯 -螳 -螶 -螺 -螻 -螽 -螾 -蟀 -蟄 -蟅 -蟆 -蟊 -蟋 -蟌 -蟎 -蟑 -蟒 -蟜 -蟠 -蟥 -蟪 -蟫 -蟬 -蟯 -蟲 -蟳 -蟴 -蟶 -蟹 -蟻 -蟾 -蠂 -蠃 -蠄 -蠅 -蠆 -蠊 -蠋 -蠍 -蠐 -蠑 -蠓 -蠔 -蠕 -蠖 -蠘 -蠙 -蠟 -蠡 -蠢 -蠣 -蠱 -蠲 -蠵 -蠶 -蠷 -蠹 -蠻 -血 -衂 -衆 -行 -衍 -衎 -術 -衕 -衖 -街 -衙 -衚 -衛 -衜 -衝 -衞 -衡 -衢 -衣 -表 -衩 -衫 -衰 -衲 -衷 -衽 -衾 -衿 -袁 -袂 -袈 -袋 -袍 -袓 -袖 -袛 -袞 -袤 -袪 -被 -袱 -袴 -袾 -裁 -裂 -裊 -裎 -裒 -裔 -裕 -裖 -裘 -裙 -補 -裝 -裟 -裡 -裨 -裬 -裱 -裳 -裴 -裵 -裸 -裹 -製 -裾 -裿 -褀 -褂 -複 -褌 -褍 -褎 -褐 -褒 -褓 -褔 -褘 -褙 -褚 -褞 -褥 -褧 -褪 -褫 -褭 -褲 -褶 -褸 -褻 -襄 -襌 -襖 -襞 -襟 -襠 -襤 -襦 -襪 -襯 -襲 -襴 -襶 -襻 -襾 -西 -要 -覃 -覆 -覇 -覈 -見 -覌 -規 -覓 -視 -覚 -覡 -覦 -覧 -親 -覬 -覲 -観 -覺 -覽 -覿 -觀 -角 -觔 -觙 -觚 -觜 -解 -觭 -觱 -觴 -觶 -觸 -觿 -言 -訁 -訂 -訃 -訇 -計 -訊 -訌 -討 -訏 -訐 -訒 -訓 -訔 -訕 -訖 -託 -記 -訛 -訝 -訟 -訣 -訥 -訪 -設 -許 -訴 -訶 -診 -註 -証 
-訾 -詁 -詆 -詈 -詐 -詒 -詔 -評 -詛 -詞 -詠 -詡 -詢 -詣 -詥 -試 -詧 -詩 -詫 -詭 -詮 -詰 -話 -該 -詳 -詵 -詹 -詼 -誄 -誅 -誇 -誌 -認 -誒 -誓 -誕 -誘 -語 -誠 -誡 -誣 -誤 -誥 -誦 -誨 -說 -説 -読 -誰 -課 -誴 -誹 -誼 -誾 -調 -談 -請 -諍 -諏 -諒 -論 -諗 -諜 -諟 -諠 -諡 -諤 -諦 -諧 -諪 -諫 -諭 -諮 -諱 -諲 -諳 -諴 -諶 -諷 -諸 -諺 -諼 -諾 -謀 -謁 -謂 -謄 -謇 -謊 -謌 -謎 -謏 -謐 -謔 -謖 -謗 -謙 -謚 -講 -謜 -謝 -謠 -謢 -謤 -謨 -謩 -謫 -謬 -謳 -謹 -謾 -證 -譏 -譓 -譔 -識 -譙 -譚 -譜 -譞 -警 -譫 -譬 -譭 -譯 -議 -譲 -譳 -譴 -護 -譽 -譿 -讀 -讃 -變 -讌 -讎 -讓 -讖 -讙 -讚 -讜 -讞 -谷 -谿 -豁 -豆 -豇 -豈 -豉 -豊 -豌 -豎 -豐 -豔 -豕 -豚 -象 -豢 -豨 -豪 -豫 -豬 -豳 -豸 -豹 -豺 -豿 -貂 -貅 -貉 -貊 -貌 -貐 -貒 -貓 -貔 -貘 -貝 -貞 -負 -財 -貢 -貤 -貧 -貨 -販 -貪 -貫 -責 -貭 -貮 -貯 -貲 -貳 -貴 -貶 -買 -貸 -貺 -費 -貼 -貽 -貿 -賀 -賁 -賂 -賃 -賄 -資 -賈 -賊 -賑 -賒 -賓 -賔 -賕 -賚 -賜 -賞 -賠 -賡 -賢 -賣 -賤 -賦 -賨 -質 -賬 -賭 -賴 -賹 -賺 -賻 -購 -賽 -賾 -贄 -贅 -贇 -贈 -贊 -贌 -贍 -贏 -贓 -贔 -贖 -贛 -赤 -赦 -赧 -赫 -赬 -赭 -走 -赳 -赴 -起 -趁 -超 -越 -趐 -趕 -趖 -趙 -趟 -趣 -趨 -足 -趴 -趵 -趺 -趼 -趾 -跅 -跆 -跋 -跌 -跏 -跑 -跖 -跗 -跛 -距 -跟 -跡 -跣 -跤 -跨 -跩 -跪 -路 -跳 -踎 -踏 -踐 -踝 -踞 -踢 -踩 -踰 -踴 -踹 -踺 -蹂 -蹄 -蹇 -蹈 -蹉 -蹊 -蹋 -蹕 -蹙 -蹟 -蹠 -蹤 -蹦 -蹬 -蹭 -蹯 -蹲 -蹴 -蹶 -蹺 -蹻 -蹼 -躁 -躂 -躄 -躉 -躋 -躍 -躑 -躒 -躔 -躝 -躪 -身 -躬 -躰 -躲 -躺 -軀 -車 -軋 -軌 -軍 -軎 -軒 -軔 -軛 -軟 -転 -軫 -軲 -軸 -軹 -軺 -軻 -軼 -軽 -軾 -較 -輄 -輅 -載 -輋 -輒 -輓 -輔 -輕 -輛 -輝 -輞 -輟 -輥 -輦 -輩 -輪 -輬 -輭 -輯 -輶 -輸 -輻 -輾 -輿 -轀 -轂 -轄 -轅 -轆 -轉 -轍 -轎 -轘 -轝 -轟 -轤 -辛 -辜 -辟 -辣 -辦 -辧 -辨 -辭 -辮 -辯 -辰 -辱 -農 -辵 -辺 -辻 -込 -迂 -迄 -迅 -迎 -近 -返 -迢 -迤 -迥 -迦 -迪 -迫 -迭 -迮 -述 -迴 -迵 -迷 -迸 -迺 -追 -退 -送 -逃 -逄 -逅 -逆 -逈 -逋 -逌 -逍 -逎 -透 -逐 -逑 -途 -逕 -逖 -逗 -這 -通 -逛 -逝 -逞 -速 -造 -逢 -連 -逤 -逨 -逮 -逯 -進 -逴 -逵 -逸 -逹 -逺 -逼 -逾 -遁 -遂 -遄 -遇 -遊 -運 -遍 -過 -遏 -遐 -遒 -道 -達 -違 -遘 -遙 -遛 -遜 -遞 -遠 -遢 -遣 -遨 -適 -遭 -遮 -遯 -遲 -遴 -遵 -遶 -遷 -選 -遹 -遺 -遼 -避 -邀 -邁 -邂 -邃 -還 -邇 -邈 -邉 -邊 -邋 -邏 -邑 -邕 -邗 -邙 -邛 -邠 -邡 -邢 -那 -邦 -邨 -邪 -邯 -邰 -邱 -邲 -邳 -邴 -邵 -邸 -邽 -邾 -郁 -郃 -郄 -郅 -郇 -郊 -郋 -郎 -郗 -郛 -郜 -郝 -郞 -郟 -郡 -郢 -郤 -部 -郪 -郫 -郭 -郯 -郳 -郴 -郵 -郷 -都 -郾 -郿 -鄂 -鄃 -鄄 -鄆 -鄉 -鄋 -鄑 -鄒 -鄔 -鄖 -鄗 -鄘 -鄙 -鄚 -鄜 -鄞 -鄠 -鄢 -鄣 -鄤 -鄧 -鄩 -鄫 -鄭 -鄯 -鄰 -鄱 -鄲 -鄳 -鄴 -鄺 -酃 -酆 -酈 -酉 -酊 -酋 -酌 -配 -酎 -酏 -酐 -酒 -酔 -酗 -酚 -酞 -酡 -酢 -酣 -酥 -酩 -酪 -酬 -酮 -酯 -酰 -酴 -酵 -酶 -酷 -酸 -酺 -酼 -醁 -醂 -醃 -醅 -醇 -醉 -醋 -醌 -醍 -醐 -醒 -醚 -醛 -醜 -醞 -醢 -醣 -醪 -醫 -醬 -醮 -醯 -醴 -醺 -醾 -醿 -釀 -釁 -釆 -采 -釉 -釋 -里 -重 -野 -量 -釐 -金 -釒 -釓 -釔 -釕 -釗 -釘 -釙 -釚 -釜 -針 -釣 -釤 -釦 -釧 -釩 -釪 -釭 -釴 -釵 -釷 -釹 -釺 -鈀 -鈁 -鈄 -鈇 -鈈 -鈉 -鈊 -鈍 -鈏 -鈐 -鈑 -鈔 -鈕 -鈖 -鈞 -鈢 -鈣 -鈥 -鈦 -鈫 -鈮 -鈰 -鈳 -鈴 -鈷 -鈸 -鈹 -鈺 -鈾 -鈿 -鉀 -鉄 -鉅 -鉆 -鉈 -鉉 -鉋 -鉌 -鉍 -鉏 -鉑 -鉓 -鉗 -鉚 -鉛 -鉞 -鉟 -鉤 -鉦 -鉬 -鉭 -鉲 -鉶 -鉷 -鉸 -鉻 -鉾 -鉿 -銀 -銂 -銃 -銅 -銋 -銍 -銑 -銓 -銕 -銖 -銘 -銚 -銜 -銠 -銣 -銥 -銦 -銨 -銩 -銪 -銫 -銬 -銭 -銱 -銲 -銳 -銶 -銷 -銹 -銻 -銼 -銾 -鋁 -鋅 -鋆 -鋇 -鋌 -鋏 -鋐 -鋒 -鋕 -鋗 -鋙 -鋡 -鋤 -鋥 -鋦 -鋨 -鋪 -鋮 -鋯 -鋰 -鋱 -鋳 -鋶 -鋸 -鋹 -鋼 -錀 -錄 -錏 -錐 -錒 -錕 -錘 -錚 -錞 -錟 -錠 -錡 -錢 -錦 -錨 -錫 -錬 -錮 -錯 -錳 -錶 -錸 -錻 -鍀 -鍇 -鍈 -鍉 -鍊 -鍋 -鍍 -鍏 -鍔 -鍘 -鍛 -鍝 -鍟 -鍠 -鍥 -鍩 -鍬 -鍱 -鍳 -鍵 -鍶 -鍷 -鍺 -鍼 -鍾 -鎂 -鎅 -鎊 -鎌 -鎏 -鎓 -鎔 -鎖 -鎗 -鎘 -鎚 -鎛 -鎢 -鎣 -鎦 -鎧 -鎪 -鎬 -鎭 -鎮 -鎰 -鎳 -鎵 -鎻 -鏃 -鏇 -鏈 -鏊 -鏌 -鏐 -鏑 -鏓 -鏖 -鏗 -鏘 -鏜 -鏝 -鏞 -鏟 -鏡 -鏢 -鏤 -鏦 -鏳 -鏴 -鏵 -鏷 -鏻 -鏽 -鐃 -鐇 -鐈 -鐓 -鐔 -鐘 -鐙 -鐠 -鐡 -鐤 -鐦 -鐧 -鐫 -鐬 -鐭 -鐮 -鐲 -鐳 -鐵 -鐸 -鐺 -鐽 -鐿 -鑀 -鑁 -鑂 -鑄 -鑅 -鑊 -鑌 -鑑 -鑒 -鑛 -鑠 -鑣 -鑨 -鑪 -鑫 -鑭 -鑰 -鑲 -鑴 -鑷 -鑼 -鑽 -鑾 -鑿 -長 -門 -閂 -閃 -閆 -閉 -開 -閎 -閏 -閑 -閒 -間 -閔 -閘 -閜 -閞 -閟 -関 -閣 -閥 -閦 -閨 -閩 -閬 -閭 -閰 -閱 -閶 -閹 -閻 -閼 -閾 -閿 -闆 -闇 -闈 -闊 -闋 -闌 -闍 -闐 -闓 -闔 -闕 -闖 -闘 -關 -闞 -闡 -闢 -闥 -阜 -阝 -阡 -阪 -阭 -阮 -阯 -阱 -防 -阻 -阿 -陀 -陁 -陂 -附 -陋 -陌 -降 -限 -陔 -陘 -陛 -陜 -陝 -陞 -陟 -陡 -院 -陣 -除 -陪 -陬 -陰 -陲 -陳 -陵 -陶 -陷 -陸 -険 -陽 -隄 -隅 -隆 -隈 -隊 -隋 -隍 -階 -隔 -隕 -隗 -隘 -隙 -際 -障 -隣 -隧 -隨 -險 -隰 -隱 -隲 -隳 -隴 -隷 -隸 -隹 -隻 -隼 -雀 -雁 -雄 -雅 -集 -雇 -雉 -雋 -雌 -雍 -雎 -雑 -雒 -雕 -雖 -雙 -雛 -雜 -雝 -雞 -離 -難 -雨 -雩 -雪 -雫 -雯 -雱 -雲 -零 -雷 -雹 -電 -需 -霄 -霅 -霆 -震 -霈 -霉 -霊 -霍 -霎 -霏 -霑 -霓 -霖 -霙 -霜 -霞 -霤 -霧 -霨 -霰 -露 -霶 -霸 -霹 -霽 -霾 -靁 -靂 -靄 -靈 -靉 -靑 -青 -靖 -靚 -靛 -靜 -非 -靠 -靡 -面 -革 -靫 -靬 -靭 -靳 -靴 -靶 -靺 -靼 -鞅 -鞆 -鞋 -鞍 -鞏 -鞘 -鞞 -鞠 -鞣 -鞥 -鞦 -鞨 -鞭 -鞮 -鞴 -韁 -韃 -韆 -韋 -韌 
-韑 -韓 -韙 -韜 -韞 -韠 -韡 -韭 -韮 -音 -韶 -韺 -韻 -韾 -響 -頁 -頂 -頃 -項 -順 -須 -頊 -頌 -頍 -頎 -頏 -預 -頑 -頒 -頓 -頔 -頗 -領 -頜 -頠 -頡 -頤 -頦 -頫 -頭 -頰 -頴 -頵 -頷 -頸 -頹 -頻 -頼 -顆 -題 -額 -顎 -顏 -顒 -顓 -顔 -顕 -顗 -願 -顙 -顛 -類 -顥 -顧 -顫 -顯 -顰 -顱 -顳 -顴 -風 -颮 -颯 -颱 -颶 -颺 -颼 -飄 -飆 -飈 -飛 -食 -飠 -飡 -飢 -飥 -飩 -飪 -飫 -飬 -飭 -飮 -飯 -飲 -飴 -飼 -飽 -飾 -餃 -餄 -餅 -餉 -養 -餌 -餎 -餐 -餒 -餓 -餗 -餘 -餚 -餛 -餞 -餠 -餡 -館 -餮 -餵 -餺 -餾 -餿 -饃 -饅 -饋 -饌 -饑 -饒 -饕 -饗 -饞 -饟 -饢 -首 -馗 -馘 -香 -馛 -馥 -馦 -馨 -馬 -馭 -馮 -馯 -馱 -馳 -馴 -馼 -駁 -駄 -駅 -駆 -駐 -駑 -駒 -駔 -駕 -駘 -駙 -駛 -駝 -駟 -駢 -駭 -駰 -駱 -駿 -騁 -騂 -騄 -騅 -騋 -騎 -騏 -験 -騖 -騙 -騤 -騨 -騫 -騭 -騮 -騰 -騶 -騷 -騾 -驁 -驃 -驄 -驅 -驊 -驌 -驍 -驎 -驒 -驕 -驗 -驚 -驛 -驟 -驢 -驤 -驥 -驩 -驪 -骨 -骯 -骰 -骶 -骷 -骸 -骼 -髀 -髂 -髎 -髏 -髑 -髒 -髓 -體 -高 -髙 -髡 -髦 -髪 -髭 -髮 -髯 -髲 -髷 -髹 -髻 -鬃 -鬄 -鬅 -鬆 -鬍 -鬚 -鬟 -鬢 -鬣 -鬥 -鬧 -鬨 -鬩 -鬪 -鬬 -鬮 -鬯 -鬱 -鬲 -鬹 -鬻 -鬼 -魁 -魂 -魃 -魄 -魅 -魈 -魋 -魍 -魎 -魏 -魔 -魕 -魘 -魚 -魛 -魞 -魟 -魣 -魨 -魩 -魮 -魯 -魴 -魷 -鮀 -鮁 -鮃 -鮄 -鮊 -鮋 -鮍 -鮐 -鮑 -鮒 -鮓 -鮗 -鮜 -鮟 -鮠 -鮡 -鮣 -鮨 -鮪 -鮫 -鮭 -鮮 -鮰 -鮸 -鮹 -鮻 -鯀 -鯁 -鯃 -鯇 -鯉 -鯊 -鯏 -鯒 -鯓 -鯔 -鯕 -鯖 -鯗 -鯙 -鯛 -鯡 -鯢 -鯤 -鯧 -鯨 -鯪 -鯭 -鯮 -鯰 -鯶 -鯷 -鯻 -鯽 -鯿 -鰂 -鰃 -鰆 -鰈 -鰉 -鰍 -鰏 -鰒 -鰓 -鰕 -鰗 -鰛 -鰜 -鰟 -鰣 -鰤 -鰧 -鰨 -鰩 -鰭 -鰮 -鰱 -鰲 -鰳 -鰶 -鰷 -鰹 -鰺 -鰻 -鰼 -鰾 -鱀 -鱂 -鱅 -鱇 -鱈 -鱉 -鱊 -鱒 -鱓 -鱔 -鱖 -鱗 -鱘 -鱚 -鱝 -鱟 -鱠 -鱣 -鱥 -鱧 -鱨 -鱬 -鱮 -鱰 -鱲 -鱵 -鱷 -鱸 -鱺 -鱻 -鳥 -鳧 -鳩 -鳯 -鳰 -鳳 -鳴 -鳶 -鳽 -鴆 -鴇 -鴉 -鴒 -鴓 -鴕 -鴗 -鴛 -鴝 -鴞 -鴟 -鴡 -鴣 -鴦 -鴨 -鴫 -鴯 -鴰 -鴴 -鴻 -鴿 -鵂 -鵄 -鵎 -鵐 -鵑 -鵒 -鵓 -鵙 -鵜 -鵝 -鵞 -鵟 -鵠 -鵡 -鵪 -鵬 -鵯 -鵰 -鵲 -鵵 -鵼 -鵾 -鶆 -鶇 -鶉 -鶏 -鶒 -鶓 -鶘 -鶚 -鶡 -鶥 -鶩 -鶬 -鶯 -鶲 -鶴 -鶹 -鶺 -鶻 -鶼 -鶿 -鷂 -鷄 -鷉 -鷎 -鷓 -鷗 -鷙 -鷚 -鷟 -鷥 -鷦 -鷫 -鷯 -鷲 -鷳 -鷸 -鷹 -鷺 -鸊 -鸌 -鸐 -鸑 -鸕 -鸘 -鸚 -鸛 -鸜 -鸝 -鸞 -鹮 -鹵 -鹹 -鹼 -鹽 -鹿 -麂 -麅 -麇 -麈 -麊 -麋 -麐 -麒 -麓 -麗 -麝 -麞 -麟 -麥 -麩 -麪 -麯 -麴 -麵 -麹 -麺 -麻 -麼 -麽 -麾 -麿 -黁 -黃 -黇 -黌 -黍 -黎 -黏 -黐 -黑 -黒 -黔 -默 -黙 -黛 -黜 -黝 -點 -黟 -黥 -黧 -黨 -黯 -黴 -黶 -黻 -黼 -黽 -黿 -鼂 -鼇 -鼈 -鼉 -鼎 -鼐 -鼒 -鼓 -鼕 -鼙 -鼠 -鼢 -鼩 -鼬 -鼯 -鼱 -鼴 -鼷 -鼻 -鼽 -鼾 -齊 -齋 -齒 -齕 -齡 -齣 -齦 -齧 -齲 -齶 -龍 -龎 -龐 -龑 -龔 -龕 -龜 -龝 -龠 -龢 -郎 -凉 -﹑ -﹗ -﹝ -﹞ -﹢ -! -" -# -$ -% -& -' -( -) -* -+ -, -- -. -/ -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -: -; -< -= -> -? -A -B -C -D -E -F -G -H -I -K -L -M -N -O -P -R -S -T -U -V -W -Y -Z -[ -] -` -a -b -c -d -e -f -g -h -i -j -k -l -m -n -o -p -r -s -t -u -z -{ -| -} -~ -¥ -𣇉 - diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/cyrillic_dict.txt b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/cyrillic_dict.txt deleted file mode 100644 index 2b6f66494d5417e18bbd225719aa72690e09e126..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/cyrillic_dict.txt +++ /dev/null @@ -1,163 +0,0 @@ - -! -# -$ -% -& -' -( -+ -, -- -. -/ -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -: -? -@ -A -B -C -D -E -F -G -H -I -J -K -L -M -N -O -P -Q -R -S -T -U -V -W -X -Y -Z -_ -a -b -c -d -e -f -g -h -i -j -k -l -m -n -o -p -q -r -s -t -u -v -w -x -y -z -É -é -Ё -Є -І -Ј -Љ -Ў -А -Б -В -Г -Д -Е -Ж -З -И -Й -К -Л -М -Н -О -П -Р -С -Т -У -Ф -Х -Ц -Ч -Ш -Щ -Ъ -Ы -Ь -Э -Ю -Я -а -б -в -г -д -е -ж -з -и -й -к -л -м -н -о -п -р -с -т -у -ф -х -ц -ч -ш -щ -ъ -ы -ь -э -ю -я -ё -ђ -є -і -ј -љ -њ -ћ -ў -џ -Ґ -ґ diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/devanagari_dict.txt b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/devanagari_dict.txt deleted file mode 100644 index f55923061bfd480b875bb3679d7a75a9157387a9..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/devanagari_dict.txt +++ /dev/null @@ -1,167 +0,0 @@ - -! -# -$ -% -& -' -( -+ -, -- -. 
-/ -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -: -? -@ -A -B -C -D -E -F -G -H -I -J -K -L -M -N -O -P -Q -R -S -T -U -V -W -X -Y -Z -_ -a -b -c -d -e -f -g -h -i -j -k -l -m -n -o -p -q -r -s -t -u -v -w -x -y -z -É -é -ँ -ं -ः -अ -आ -इ -ई -उ -ऊ -ऋ -ए -ऐ -ऑ -ओ -औ -क -ख -ग -घ -ङ -च -छ -ज -झ -ञ -ट -ठ -ड -ढ -ण -त -थ -द -ध -न -ऩ -प -फ -ब -भ -म -य -र -ऱ -ल -ळ -व -श -ष -स -ह -़ -ा -ि -ी -ु -ू -ृ -ॅ -े -ै -ॉ -ो -ौ -् -॒ -क़ -ख़ -ग़ -ज़ -ड़ -ढ़ -फ़ -ॠ -। -० -१ -२ -३ -४ -५ -६ -७ -८ -९ -॰ diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/en_dict.txt b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/en_dict.txt deleted file mode 100644 index 7677d31b9d3f08eef2823c2cf051beeab1f0470b..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/en_dict.txt +++ /dev/null @@ -1,95 +0,0 @@ -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -: -; -< -= -> -? -@ -A -B -C -D -E -F -G -H -I -J -K -L -M -N -O -P -Q -R -S -T -U -V -W -X -Y -Z -[ -\ -] -^ -_ -` -a -b -c -d -e -f -g -h -i -j -k -l -m -n -o -p -q -r -s -t -u -v -w -x -y -z -{ -| -} -~ -! -" -# -$ -% -& -' -( -) -* -+ -, -- -. -/ - diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/japan_dict.txt b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/japan_dict.txt deleted file mode 100644 index 339d4b89e5159a346636641a0814874faa59754a..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/japan_dict.txt +++ /dev/null @@ -1,4399 +0,0 @@ -! -" -# -$ -% -& -' -( -) -* -+ -, -- -. -/ -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -: -; -< -= -> -? -A -B -C -D -E -F -G -H -I -J -K -L -M -N -O -P -Q -R -S -T -U -V -W -X -Y -Z -[ -] -_ -` -a -b -c -d -e -f -g -h -i -j -k -l -m -n -o -p -q -r -s -t -u -v -w -x -y -z -© -° -² -´ -½ -Á -Ä -Å -Ç -È -É -Í -Ó -Ö -× -Ü -ß -à -á -â -ã -ä -å -æ -ç -è -é -ê -ë -í -ð -ñ -ò -ó -ô -õ -ö -ø -ú -û -ü -ý -ā -ă -ą -ć -Č -č -đ -ē -ė -ę -ğ -ī -ı -Ł -ł -ń -ň -ō -ř -Ş -ş -Š -š -ţ -ū -ż -Ž -ž -Ș -ș -ț -Δ -α -λ -μ -φ -Г -О -а -в -л -о -р -с -т -я -ồ -​ -— -― -’ -“ -” -… -℃ -→ -∇ -− -■ -☆ -  -、 -。 -々 -〆 -〈 -〉 -「 -」 -『 -』 -〔 -〕 -〜 -ぁ -あ -ぃ -い -う -ぇ -え -ぉ -お -か -が -き -ぎ -く -ぐ -け -げ -こ -ご -さ -ざ -し -じ -す -ず -せ -ぜ -そ -ぞ -た -だ -ち -ぢ -っ -つ -づ -て -で -と -ど -な -に -ぬ -ね -の -は -ば -ぱ -ひ -び -ぴ -ふ -ぶ -ぷ -へ -べ -ぺ -ほ -ぼ -ぽ -ま -み -む -め -も -ゃ -や -ゅ -ゆ -ょ -よ -ら -り -る -れ -ろ -わ -ゑ -を -ん -ゝ -ゞ -ァ -ア -ィ -イ -ゥ -ウ -ェ -エ -ォ -オ -カ -ガ -キ -ギ -ク -グ -ケ -ゲ -コ -ゴ -サ -ザ -シ -ジ -ス -ズ -セ -ゼ -ソ -ゾ -タ -ダ -チ -ヂ -ッ -ツ -ヅ -テ -デ -ト -ド -ナ -ニ -ヌ -ネ -ノ -ハ -バ -パ -ヒ -ビ -ピ -フ -ブ -プ -ヘ -ベ -ペ -ホ -ボ -ポ -マ -ミ -ム -メ -モ -ャ -ヤ -ュ -ユ -ョ -ヨ -ラ -リ -ル -レ -ロ -ワ -ヰ -ン -ヴ -ヵ -ヶ -・ -ー -㈱ -一 -丁 -七 -万 -丈 -三 -上 -下 -不 -与 -丑 -且 -世 -丘 -丙 -丞 -両 -並 -中 -串 -丸 -丹 -主 -丼 -丿 -乃 -久 -之 -乎 -乏 -乗 -乘 -乙 -九 -乞 -也 -乱 -乳 -乾 -亀 -了 -予 -争 -事 -二 -于 -互 -五 -井 -亘 -亙 -些 -亜 -亟 -亡 -交 -亥 -亦 -亨 -享 -京 -亭 -亮 -人 -什 -仁 -仇 -今 -介 -仍 -仏 -仔 -仕 -他 -仗 -付 -仙 -代 -令 -以 -仮 -仰 -仲 -件 -任 -企 -伊 -伍 -伎 -伏 -伐 -休 -会 -伝 -伯 -估 -伴 -伶 -伸 -伺 -似 -伽 -佃 -但 -位 -低 -住 -佐 -佑 -体 -何 -余 -佚 -佛 -作 -佩 -佳 -併 -佶 -使 -侈 -例 -侍 -侏 -侑 -侘 -供 -依 -侠 -価 -侮 -侯 -侵 -侶 -便 -係 -促 -俄 -俊 -俔 -俗 -俘 -保 -信 -俣 -俤 -修 -俯 -俳 -俵 -俸 -俺 -倉 -個 -倍 -倒 -候 -借 -倣 -値 -倫 -倭 -倶 -倹 -偃 -假 -偈 -偉 -偏 -偐 -偕 -停 -健 -側 -偵 -偶 -偽 -傀 -傅 -傍 -傑 -傘 -備 -催 -傭 -傲 -傳 -債 -傷 -傾 -僊 -働 -像 -僑 -僕 -僚 -僧 -僭 -僮 -儀 -億 -儇 -儒 -儛 -償 -儡 -優 -儲 -儺 -儼 -兀 -允 -元 -兄 -充 -兆 -先 -光 -克 -兌 -免 -兎 -児 -党 -兜 -入 -全 -八 -公 -六 -共 -兵 -其 -具 -典 -兼 -内 -円 -冊 -再 -冑 -冒 -冗 -写 -冠 -冤 -冥 -冨 -冬 -冲 -决 -冶 -冷 -准 -凉 -凋 -凌 -凍 -凛 
-凝 -凞 -几 -凡 -処 -凪 -凰 -凱 -凶 -凸 -凹 -出 -函 -刀 -刃 -分 -切 -刈 -刊 -刎 -刑 -列 -初 -判 -別 -利 -刪 -到 -制 -刷 -券 -刹 -刺 -刻 -剃 -則 -削 -剋 -前 -剖 -剛 -剣 -剤 -剥 -剪 -副 -剰 -割 -創 -剽 -劇 -劉 -劔 -力 -功 -加 -劣 -助 -努 -劫 -劭 -励 -労 -効 -劾 -勃 -勅 -勇 -勉 -勒 -動 -勘 -務 -勝 -募 -勢 -勤 -勧 -勲 -勺 -勾 -勿 -匁 -匂 -包 -匏 -化 -北 -匙 -匝 -匠 -匡 -匣 -匯 -匲 -匹 -区 -医 -匿 -十 -千 -升 -午 -卉 -半 -卍 -卑 -卒 -卓 -協 -南 -単 -博 -卜 -占 -卦 -卯 -印 -危 -即 -却 -卵 -卸 -卿 -厄 -厚 -原 -厠 -厨 -厩 -厭 -厳 -去 -参 -又 -叉 -及 -友 -双 -反 -収 -叔 -取 -受 -叙 -叛 -叟 -叡 -叢 -口 -古 -句 -叩 -只 -叫 -召 -可 -台 -叱 -史 -右 -叶 -号 -司 -吃 -各 -合 -吉 -吊 -同 -名 -后 -吏 -吐 -向 -君 -吝 -吟 -吠 -否 -含 -吸 -吹 -吻 -吽 -吾 -呂 -呆 -呈 -呉 -告 -呑 -周 -呪 -呰 -味 -呼 -命 -咀 -咄 -咋 -和 -咒 -咫 -咲 -咳 -咸 -哀 -品 -哇 -哉 -員 -哨 -哩 -哭 -哲 -哺 -唄 -唆 -唇 -唐 -唖 -唯 -唱 -唳 -唸 -唾 -啄 -商 -問 -啓 -啼 -善 -喋 -喚 -喜 -喝 -喧 -喩 -喪 -喫 -喬 -單 -喰 -営 -嗅 -嗇 -嗔 -嗚 -嗜 -嗣 -嘆 -嘉 -嘗 -嘘 -嘩 -嘯 -嘱 -嘲 -嘴 -噂 -噌 -噛 -器 -噴 -噺 -嚆 -嚢 -囀 -囃 -囉 -囚 -四 -回 -因 -団 -困 -囲 -図 -固 -国 -圀 -圃 -國 -圏 -園 -圓 -團 -圜 -土 -圧 -在 -圭 -地 -址 -坂 -均 -坊 -坐 -坑 -坡 -坤 -坦 -坪 -垂 -型 -垢 -垣 -埃 -埋 -城 -埒 -埔 -域 -埠 -埴 -埵 -執 -培 -基 -埼 -堀 -堂 -堅 -堆 -堕 -堤 -堪 -堯 -堰 -報 -場 -堵 -堺 -塀 -塁 -塊 -塑 -塔 -塗 -塘 -塙 -塚 -塞 -塩 -填 -塵 -塾 -境 -墉 -墓 -増 -墜 -墟 -墨 -墳 -墺 -墻 -墾 -壁 -壇 -壊 -壌 -壕 -士 -壬 -壮 -声 -壱 -売 -壷 -壹 -壺 -壽 -変 -夏 -夕 -外 -夙 -多 -夜 -夢 -夥 -大 -天 -太 -夫 -夬 -夭 -央 -失 -夷 -夾 -奄 -奇 -奈 -奉 -奎 -奏 -契 -奔 -奕 -套 -奘 -奠 -奢 -奥 -奨 -奪 -奮 -女 -奴 -奸 -好 -如 -妃 -妄 -妊 -妍 -妓 -妖 -妙 -妥 -妨 -妬 -妲 -妹 -妻 -妾 -姉 -始 -姐 -姓 -委 -姚 -姜 -姞 -姥 -姦 -姨 -姪 -姫 -姶 -姻 -姿 -威 -娑 -娘 -娟 -娠 -娩 -娯 -娼 -婆 -婉 -婚 -婢 -婦 -婬 -婿 -媄 -媒 -媓 -媚 -媛 -媞 -媽 -嫁 -嫄 -嫉 -嫌 -嫐 -嫗 -嫡 -嬉 -嬌 -嬢 -嬪 -嬬 -嬾 -孁 -子 -孔 -字 -存 -孚 -孝 -孟 -季 -孤 -学 -孫 -孵 -學 -宅 -宇 -守 -安 -宋 -完 -宍 -宏 -宕 -宗 -官 -宙 -定 -宛 -宜 -宝 -実 -客 -宣 -室 -宥 -宮 -宰 -害 -宴 -宵 -家 -宸 -容 -宿 -寂 -寄 -寅 -密 -寇 -富 -寒 -寓 -寔 -寛 -寝 -察 -寡 -實 -寧 -審 -寮 -寵 -寶 -寸 -寺 -対 -寿 -封 -専 -射 -将 -尉 -尊 -尋 -對 -導 -小 -少 -尖 -尚 -尤 -尪 -尭 -就 -尹 -尺 -尻 -尼 -尽 -尾 -尿 -局 -居 -屈 -届 -屋 -屍 -屎 -屏 -屑 -屓 -展 -属 -屠 -層 -履 -屯 -山 -岐 -岑 -岡 -岩 -岫 -岬 -岳 -岷 -岸 -峠 -峡 -峨 -峯 -峰 -島 -峻 -崇 -崋 -崎 -崑 -崖 -崗 -崛 -崩 -嵌 -嵐 -嵩 -嵯 -嶂 -嶋 -嶠 -嶺 -嶼 -嶽 -巀 -巌 -巒 -巖 -川 -州 -巡 -巣 -工 -左 -巧 -巨 -巫 -差 -己 -巳 -巴 -巷 -巻 -巽 -巾 -市 -布 -帆 -希 -帖 -帚 -帛 -帝 -帥 -師 -席 -帯 -帰 -帳 -帷 -常 -帽 -幄 -幅 -幇 -幌 -幔 -幕 -幟 -幡 -幢 -幣 -干 -平 -年 -并 -幸 -幹 -幻 -幼 -幽 -幾 -庁 -広 -庄 -庇 -床 -序 -底 -庖 -店 -庚 -府 -度 -座 -庫 -庭 -庵 -庶 -康 -庸 -廂 -廃 -廉 -廊 -廓 -廟 -廠 -廣 -廬 -延 -廷 -建 -廻 -廼 -廿 -弁 -弄 -弉 -弊 -弌 -式 -弐 -弓 -弔 -引 -弖 -弗 -弘 -弛 -弟 -弥 -弦 -弧 -弱 -張 -強 -弼 -弾 -彈 -彊 -彌 -彎 -当 -彗 -彙 -彝 -形 -彦 -彩 -彫 -彬 -彭 -彰 -影 -彷 -役 -彼 -往 -征 -徂 -径 -待 -律 -後 -徐 -徑 -徒 -従 -得 -徠 -御 -徧 -徨 -復 -循 -徭 -微 -徳 -徴 -德 -徹 -徽 -心 -必 -忉 -忌 -忍 -志 -忘 -忙 -応 -忠 -快 -忯 -念 -忻 -忽 -忿 -怒 -怖 -思 -怠 -怡 -急 -性 -怨 -怪 -怯 -恂 -恋 -恐 -恒 -恕 -恣 -恤 -恥 -恨 -恩 -恬 -恭 -息 -恵 -悉 -悌 -悍 -悔 -悟 -悠 -患 -悦 -悩 -悪 -悲 -悼 -情 -惇 -惑 -惚 -惜 -惟 -惠 -惣 -惧 -惨 -惰 -想 -惹 -惺 -愈 -愉 -愍 -意 -愔 -愚 -愛 -感 -愷 -愿 -慈 -態 -慌 -慎 -慕 -慢 -慣 -慧 -慨 -慮 -慰 -慶 -憂 -憎 -憐 -憑 -憙 -憤 -憧 -憩 -憬 -憲 -憶 -憾 -懇 -應 -懌 -懐 -懲 -懸 -懺 -懽 -懿 -戈 -戊 -戌 -戎 -成 -我 -戒 -戔 -或 -戚 -戟 -戦 -截 -戮 -戯 -戴 -戸 -戻 -房 -所 -扁 -扇 -扈 -扉 -手 -才 -打 -払 -托 -扮 -扱 -扶 -批 -承 -技 -抄 -把 -抑 -抓 -投 -抗 -折 -抜 -択 -披 -抱 -抵 -抹 -押 -抽 -担 -拇 -拈 -拉 -拍 -拏 -拐 -拒 -拓 -拘 -拙 -招 -拝 -拠 -拡 -括 -拭 -拳 -拵 -拶 -拾 -拿 -持 -挂 -指 -按 -挑 -挙 -挟 -挨 -振 -挺 -挽 -挿 -捉 -捕 -捗 -捜 -捧 -捨 -据 -捺 -捻 -掃 -掄 -授 -掌 -排 -掖 -掘 -掛 -掟 -採 -探 -掣 -接 -控 -推 -掩 -措 -掬 -掲 -掴 -掻 -掾 -揃 -揄 -揆 -揉 -描 -提 -揖 -揚 -換 -握 -揮 -援 -揶 -揺 -損 -搦 -搬 -搭 -携 -搾 -摂 -摘 -摩 -摸 -摺 -撃 -撒 -撞 -撤 -撥 -撫 -播 -撮 -撰 -撲 -撹 -擁 -操 -擔 -擦 -擬 -擾 -攘 -攝 -攣 -支 -收 -改 -攻 -放 -政 -故 -敏 -救 -敗 -教 -敢 -散 -敦 -敬 -数 -整 -敵 -敷 -斂 -文 -斉 -斎 -斐 -斑 -斗 -料 -斜 -斟 -斤 -斥 -斧 -斬 -断 -斯 -新 -方 -於 -施 -旁 -旅 -旋 -旌 -族 -旗 -旛 -无 -旡 -既 -日 -旦 -旧 -旨 -早 -旬 -旭 -旺 -旻 -昂 -昆 -昇 -昉 -昌 -明 -昏 -易 -昔 -星 -映 -春 -昧 -昨 -昪 -昭 -是 -昵 -昼 -晁 -時 -晃 -晋 -晏 -晒 -晟 -晦 -晧 -晩 -普 -景 -晴 -晶 -智 -暁 -暇 -暈 -暉 -暑 -暖 -暗 -暘 -暢 -暦 -暫 -暮 -暲 -暴 -暹 -暾 -曄 -曇 -曉 -曖 -曙 -曜 -曝 -曠 -曰 -曲 -曳 -更 -書 -曹 -曼 -曽 -曾 -替 -最 -會 -月 -有 -朋 -服 -朏 -朔 -朕 
-朗 -望 -朝 -期 -朧 -木 -未 -末 -本 -札 -朱 -朴 -机 -朽 -杁 -杉 -李 -杏 -材 -村 -杓 -杖 -杜 -杞 -束 -条 -杢 -杣 -来 -杭 -杮 -杯 -東 -杲 -杵 -杷 -杼 -松 -板 -枅 -枇 -析 -枓 -枕 -林 -枚 -果 -枝 -枠 -枡 -枢 -枯 -枳 -架 -柄 -柊 -柏 -某 -柑 -染 -柔 -柘 -柚 -柯 -柱 -柳 -柴 -柵 -査 -柾 -柿 -栂 -栃 -栄 -栖 -栗 -校 -株 -栲 -栴 -核 -根 -栻 -格 -栽 -桁 -桂 -桃 -框 -案 -桐 -桑 -桓 -桔 -桜 -桝 -桟 -桧 -桴 -桶 -桾 -梁 -梅 -梆 -梓 -梔 -梗 -梛 -條 -梟 -梢 -梧 -梨 -械 -梱 -梲 -梵 -梶 -棄 -棋 -棒 -棗 -棘 -棚 -棟 -棠 -森 -棲 -棹 -棺 -椀 -椅 -椋 -植 -椎 -椏 -椒 -椙 -検 -椥 -椹 -椿 -楊 -楓 -楕 -楚 -楞 -楠 -楡 -楢 -楨 -楪 -楫 -業 -楮 -楯 -楳 -極 -楷 -楼 -楽 -概 -榊 -榎 -榕 -榛 -榜 -榮 -榱 -榴 -槃 -槇 -槊 -構 -槌 -槍 -槐 -様 -槙 -槻 -槽 -槿 -樂 -樋 -樓 -樗 -標 -樟 -模 -権 -横 -樫 -樵 -樹 -樺 -樽 -橇 -橋 -橘 -機 -橿 -檀 -檄 -檎 -檐 -檗 -檜 -檣 -檥 -檬 -檮 -檸 -檻 -櫃 -櫓 -櫛 -櫟 -櫨 -櫻 -欄 -欅 -欠 -次 -欣 -欧 -欲 -欺 -欽 -款 -歌 -歎 -歓 -止 -正 -此 -武 -歩 -歪 -歯 -歳 -歴 -死 -殆 -殉 -殊 -残 -殖 -殯 -殴 -段 -殷 -殺 -殻 -殿 -毀 -毅 -母 -毎 -毒 -比 -毘 -毛 -毫 -毬 -氈 -氏 -民 -気 -水 -氷 -永 -氾 -汀 -汁 -求 -汎 -汐 -汗 -汚 -汝 -江 -池 -汪 -汰 -汲 -決 -汽 -沂 -沃 -沅 -沆 -沈 -沌 -沐 -沓 -沖 -沙 -没 -沢 -沱 -河 -沸 -油 -治 -沼 -沽 -沿 -況 -泉 -泊 -泌 -法 -泗 -泡 -波 -泣 -泥 -注 -泯 -泰 -泳 -洋 -洒 -洗 -洛 -洞 -津 -洩 -洪 -洲 -洸 -洹 -活 -洽 -派 -流 -浄 -浅 -浙 -浚 -浜 -浣 -浦 -浩 -浪 -浮 -浴 -海 -浸 -涅 -消 -涌 -涙 -涛 -涯 -液 -涵 -涼 -淀 -淄 -淆 -淇 -淋 -淑 -淘 -淡 -淤 -淨 -淫 -深 -淳 -淵 -混 -淹 -添 -清 -済 -渉 -渋 -渓 -渕 -渚 -減 -渟 -渠 -渡 -渤 -渥 -渦 -温 -渫 -測 -港 -游 -渾 -湊 -湖 -湘 -湛 -湧 -湫 -湯 -湾 -湿 -満 -源 -準 -溜 -溝 -溢 -溥 -溪 -溶 -溺 -滄 -滅 -滋 -滌 -滑 -滕 -滝 -滞 -滴 -滸 -滹 -滿 -漁 -漂 -漆 -漉 -漏 -漑 -演 -漕 -漠 -漢 -漣 -漫 -漬 -漱 -漸 -漿 -潅 -潔 -潙 -潜 -潟 -潤 -潭 -潮 -潰 -潴 -澁 -澂 -澄 -澎 -澗 -澤 -澪 -澱 -澳 -激 -濁 -濃 -濟 -濠 -濡 -濤 -濫 -濯 -濱 -濾 -瀉 -瀋 -瀑 -瀕 -瀞 -瀟 -瀧 -瀬 -瀾 -灌 -灑 -灘 -火 -灯 -灰 -灸 -災 -炉 -炊 -炎 -炒 -炭 -炮 -炷 -点 -為 -烈 -烏 -烙 -烝 -烹 -焔 -焙 -焚 -無 -焦 -然 -焼 -煇 -煉 -煌 -煎 -煕 -煙 -煤 -煥 -照 -煩 -煬 -煮 -煽 -熈 -熊 -熙 -熟 -熨 -熱 -熹 -熾 -燃 -燈 -燎 -燔 -燕 -燗 -燥 -燭 -燻 -爆 -爐 -爪 -爬 -爲 -爵 -父 -爺 -爼 -爽 -爾 -片 -版 -牌 -牒 -牘 -牙 -牛 -牝 -牟 -牡 -牢 -牧 -物 -牲 -特 -牽 -犂 -犠 -犬 -犯 -状 -狂 -狄 -狐 -狗 -狙 -狛 -狡 -狩 -独 -狭 -狷 -狸 -狼 -猊 -猛 -猟 -猥 -猨 -猩 -猪 -猫 -献 -猴 -猶 -猷 -猾 -猿 -獄 -獅 -獏 -獣 -獲 -玄 -玅 -率 -玉 -王 -玖 -玩 -玲 -珀 -珂 -珈 -珉 -珊 -珍 -珎 -珞 -珠 -珣 -珥 -珪 -班 -現 -球 -理 -琉 -琢 -琥 -琦 -琮 -琲 -琳 -琴 -琵 -琶 -瑁 -瑋 -瑙 -瑚 -瑛 -瑜 -瑞 -瑠 -瑤 -瑩 -瑪 -瑳 -瑾 -璃 -璋 -璜 -璞 -璧 -璨 -環 -璵 -璽 -璿 -瓊 -瓔 -瓜 -瓢 -瓦 -瓶 -甍 -甑 -甕 -甘 -甚 -甞 -生 -産 -甥 -用 -甫 -田 -由 -甲 -申 -男 -町 -画 -界 -畏 -畑 -畔 -留 -畜 -畝 -畠 -畢 -略 -番 -異 -畳 -當 -畷 -畸 -畺 -畿 -疆 -疇 -疋 -疎 -疏 -疑 -疫 -疱 -疲 -疹 -疼 -疾 -病 -症 -痒 -痔 -痕 -痘 -痙 -痛 -痢 -痩 -痴 -痺 -瘍 -瘡 -瘧 -療 -癇 -癌 -癒 -癖 -癡 -癪 -発 -登 -白 -百 -的 -皆 -皇 -皋 -皐 -皓 -皮 -皺 -皿 -盂 -盃 -盆 -盈 -益 -盒 -盗 -盛 -盞 -盟 -盡 -監 -盤 -盥 -盧 -目 -盲 -直 -相 -盾 -省 -眉 -看 -県 -眞 -真 -眠 -眷 -眺 -眼 -着 -睡 -督 -睦 -睨 -睿 -瞋 -瞑 -瞞 -瞬 -瞭 -瞰 -瞳 -瞻 -瞼 -瞿 -矍 -矛 -矜 -矢 -知 -矧 -矩 -短 -矮 -矯 -石 -砂 -砌 -研 -砕 -砥 -砦 -砧 -砲 -破 -砺 -硝 -硫 -硬 -硯 -碁 -碇 -碌 -碑 -碓 -碕 -碗 -碣 -碧 -碩 -確 -碾 -磁 -磐 -磔 -磧 -磨 -磬 -磯 -礁 -礎 -礒 -礙 -礫 -礬 -示 -礼 -社 -祀 -祁 -祇 -祈 -祉 -祐 -祓 -祕 -祖 -祗 -祚 -祝 -神 -祟 -祠 -祢 -祥 -票 -祭 -祷 -祺 -禁 -禄 -禅 -禊 -禍 -禎 -福 -禔 -禖 -禛 -禦 -禧 -禮 -禰 -禹 -禽 -禿 -秀 -私 -秋 -科 -秒 -秘 -租 -秤 -秦 -秩 -称 -移 -稀 -程 -税 -稔 -稗 -稙 -稚 -稜 -稠 -種 -稱 -稲 -稷 -稻 -稼 -稽 -稿 -穀 -穂 -穆 -積 -穎 -穏 -穗 -穜 -穢 -穣 -穫 -穴 -究 -空 -突 -窃 -窄 -窒 -窓 -窟 -窠 -窩 -窪 -窮 -窯 -竃 -竄 -竈 -立 -站 -竜 -竝 -竟 -章 -童 -竪 -竭 -端 -竴 -競 -竹 -竺 -竽 -竿 -笄 -笈 -笏 -笑 -笙 -笛 -笞 -笠 -笥 -符 -第 -笹 -筅 -筆 -筇 -筈 -等 -筋 -筌 -筍 -筏 -筐 -筑 -筒 -答 -策 -筝 -筥 -筧 -筬 -筮 -筯 -筰 -筵 -箆 -箇 -箋 -箏 -箒 -箔 -箕 -算 -箙 -箜 -管 -箪 -箭 -箱 -箸 -節 -篁 -範 -篆 -篇 -築 -篋 -篌 -篝 -篠 -篤 -篥 -篦 -篩 -篭 -篳 -篷 -簀 -簒 -簡 -簧 -簪 -簫 -簺 -簾 -簿 -籀 -籃 -籌 -籍 -籐 -籟 -籠 -籤 -籬 -米 -籾 -粂 -粉 -粋 -粒 -粕 -粗 -粘 -粛 -粟 -粥 -粧 -粮 -粳 -精 -糊 -糖 -糜 -糞 -糟 -糠 -糧 -糯 -糸 -糺 -系 -糾 -紀 -約 -紅 -紋 -納 -紐 -純 -紗 -紘 -紙 -級 -紛 -素 -紡 -索 -紫 -紬 -累 -細 -紳 -紵 -紹 -紺 -絁 -終 -絃 -組 -絅 -経 -結 -絖 -絞 -絡 -絣 -給 -統 -絲 -絵 -絶 -絹 -絽 -綏 -經 -継 -続 -綜 -綟 -綬 -維 -綱 -網 -綴 -綸 -綺 -綽 -綾 -綿 -緊 -緋 -総 -緑 -緒 -線 -締 -緥 -編 -緩 -緬 -緯 -練 -緻 -縁 -縄 -縅 -縒 -縛 -縞 -縢 -縣 -縦 -縫 -縮 -縹 -總 -績 -繁 -繊 -繋 -繍 -織 -繕 -繝 -繦 -繧 -繰 -繹 -繼 -纂 -纈 -纏 -纐 -纒 -纛 -缶 -罔 -罠 -罧 -罪 -置 -罰 -署 -罵 -罷 
-罹 -羂 -羅 -羆 -羇 -羈 -羊 -羌 -美 -群 -羨 -義 -羯 -羲 -羹 -羽 -翁 -翅 -翌 -習 -翔 -翛 -翠 -翡 -翫 -翰 -翺 -翻 -翼 -耀 -老 -考 -者 -耆 -而 -耐 -耕 -耗 -耨 -耳 -耶 -耽 -聊 -聖 -聘 -聚 -聞 -聟 -聡 -聨 -聯 -聰 -聲 -聴 -職 -聾 -肄 -肆 -肇 -肉 -肋 -肌 -肖 -肘 -肛 -肝 -股 -肢 -肥 -肩 -肪 -肯 -肱 -育 -肴 -肺 -胃 -胆 -背 -胎 -胖 -胚 -胝 -胞 -胡 -胤 -胱 -胴 -胸 -能 -脂 -脅 -脆 -脇 -脈 -脊 -脚 -脛 -脩 -脱 -脳 -腋 -腎 -腐 -腑 -腔 -腕 -腫 -腰 -腱 -腸 -腹 -腺 -腿 -膀 -膏 -膚 -膜 -膝 -膠 -膣 -膨 -膩 -膳 -膵 -膾 -膿 -臂 -臆 -臈 -臍 -臓 -臘 -臚 -臣 -臥 -臨 -自 -臭 -至 -致 -臺 -臼 -舂 -舅 -與 -興 -舌 -舍 -舎 -舒 -舖 -舗 -舘 -舜 -舞 -舟 -舩 -航 -般 -舳 -舶 -船 -艇 -艘 -艦 -艮 -良 -色 -艶 -芋 -芒 -芙 -芝 -芥 -芦 -芬 -芭 -芯 -花 -芳 -芸 -芹 -芻 -芽 -芿 -苅 -苑 -苔 -苗 -苛 -苞 -苡 -若 -苦 -苧 -苫 -英 -苴 -苻 -茂 -范 -茄 -茅 -茎 -茗 -茘 -茜 -茨 -茲 -茵 -茶 -茸 -茹 -草 -荊 -荏 -荒 -荘 -荷 -荻 -荼 -莞 -莪 -莫 -莬 -莱 -莵 -莽 -菅 -菊 -菌 -菓 -菖 -菘 -菜 -菟 -菩 -菫 -華 -菱 -菴 -萄 -萊 -萌 -萍 -萎 -萠 -萩 -萬 -萱 -落 -葉 -著 -葛 -葡 -董 -葦 -葩 -葬 -葭 -葱 -葵 -葺 -蒋 -蒐 -蒔 -蒙 -蒟 -蒡 -蒲 -蒸 -蒻 -蒼 -蒿 -蓄 -蓆 -蓉 -蓋 -蓑 -蓬 -蓮 -蓼 -蔀 -蔑 -蔓 -蔚 -蔡 -蔦 -蔬 -蔭 -蔵 -蔽 -蕃 -蕉 -蕊 -蕎 -蕨 -蕩 -蕪 -蕭 -蕾 -薄 -薇 -薊 -薔 -薗 -薙 -薛 -薦 -薨 -薩 -薪 -薫 -薬 -薭 -薮 -藁 -藉 -藍 -藏 -藐 -藝 -藤 -藩 -藪 -藷 -藹 -藺 -藻 -蘂 -蘆 -蘇 -蘊 -蘭 -虎 -虐 -虔 -虚 -虜 -虞 -號 -虫 -虹 -虻 -蚊 -蚕 -蛇 -蛉 -蛍 -蛎 -蛙 -蛛 -蛟 -蛤 -蛭 -蛮 -蛸 -蛹 -蛾 -蜀 -蜂 -蜃 -蜆 -蜊 -蜘 -蜜 -蜷 -蜻 -蝉 -蝋 -蝕 -蝙 -蝠 -蝦 -蝶 -蝿 -螂 -融 -螣 -螺 -蟄 -蟇 -蟠 -蟷 -蟹 -蟻 -蠢 -蠣 -血 -衆 -行 -衍 -衒 -術 -街 -衙 -衛 -衝 -衞 -衡 -衢 -衣 -表 -衫 -衰 -衵 -衷 -衽 -衾 -衿 -袁 -袈 -袋 -袍 -袒 -袖 -袙 -袞 -袢 -被 -袰 -袱 -袴 -袷 -袿 -裁 -裂 -裃 -装 -裏 -裔 -裕 -裘 -裙 -補 -裟 -裡 -裲 -裳 -裴 -裸 -裹 -製 -裾 -褂 -褄 -複 -褌 -褐 -褒 -褥 -褪 -褶 -褻 -襄 -襖 -襞 -襟 -襠 -襦 -襪 -襲 -襴 -襷 -西 -要 -覆 -覇 -覈 -見 -規 -視 -覗 -覚 -覧 -親 -覲 -観 -覺 -觀 -角 -解 -触 -言 -訂 -計 -討 -訓 -託 -記 -訛 -訟 -訢 -訥 -訪 -設 -許 -訳 -訴 -訶 -診 -註 -証 -詐 -詔 -評 -詛 -詞 -詠 -詢 -詣 -試 -詩 -詫 -詮 -詰 -話 -該 -詳 -誄 -誅 -誇 -誉 -誌 -認 -誓 -誕 -誘 -語 -誠 -誡 -誣 -誤 -誥 -誦 -説 -読 -誰 -課 -誼 -誾 -調 -談 -請 -諌 -諍 -諏 -諒 -論 -諚 -諜 -諟 -諡 -諦 -諧 -諫 -諭 -諮 -諱 -諶 -諷 -諸 -諺 -諾 -謀 -謄 -謌 -謎 -謗 -謙 -謚 -講 -謝 -謡 -謫 -謬 -謹 -證 -識 -譚 -譛 -譜 -警 -譬 -譯 -議 -譲 -譴 -護 -讀 -讃 -讐 -讒 -谷 -谿 -豅 -豆 -豊 -豎 -豐 -豚 -象 -豪 -豫 -豹 -貌 -貝 -貞 -負 -財 -貢 -貧 -貨 -販 -貪 -貫 -責 -貯 -貰 -貴 -買 -貸 -費 -貼 -貿 -賀 -賁 -賂 -賃 -賄 -資 -賈 -賊 -賎 -賑 -賓 -賛 -賜 -賞 -賠 -賢 -賣 -賤 -賦 -質 -賭 -購 -賽 -贄 -贅 -贈 -贋 -贔 -贖 -赤 -赦 -走 -赴 -起 -超 -越 -趙 -趣 -足 -趺 -趾 -跋 -跏 -距 -跡 -跨 -跪 -路 -跳 -践 -踊 -踏 -踐 -踞 -踪 -踵 -蹄 -蹉 -蹊 -蹟 -蹲 -蹴 -躅 -躇 -躊 -躍 -躑 -躙 -躪 -身 -躬 -躯 -躰 -車 -軋 -軌 -軍 -軒 -軟 -転 -軸 -軻 -軽 -軾 -較 -載 -輌 -輔 -輜 -輝 -輦 -輩 -輪 -輯 -輸 -輿 -轄 -轍 -轟 -轢 -辛 -辞 -辟 -辥 -辦 -辨 -辰 -辱 -農 -辺 -辻 -込 -迂 -迅 -迎 -近 -返 -迢 -迦 -迪 -迫 -迭 -述 -迷 -迹 -追 -退 -送 -逃 -逅 -逆 -逍 -透 -逐 -逓 -途 -逕 -逗 -這 -通 -逝 -逞 -速 -造 -逢 -連 -逮 -週 -進 -逸 -逼 -遁 -遂 -遅 -遇 -遊 -運 -遍 -過 -遐 -道 -達 -違 -遙 -遜 -遠 -遡 -遣 -遥 -適 -遭 -遮 -遯 -遵 -遷 -選 -遺 -遼 -避 -邀 -邁 -邂 -邃 -還 -邇 -邉 -邊 -邑 -那 -邦 -邨 -邪 -邯 -邵 -邸 -郁 -郊 -郎 -郡 -郢 -部 -郭 -郴 -郵 -郷 -都 -鄂 -鄙 -鄭 -鄰 -鄲 -酉 -酋 -酌 -配 -酎 -酒 -酔 -酢 -酥 -酪 -酬 -酵 -酷 -酸 -醍 -醐 -醒 -醗 -醜 -醤 -醪 -醵 -醸 -采 -釈 -釉 -釋 -里 -重 -野 -量 -釐 -金 -釘 -釜 -針 -釣 -釧 -釿 -鈍 -鈎 -鈐 -鈔 -鈞 -鈦 -鈴 -鈷 -鈸 -鈿 -鉄 -鉇 -鉉 -鉋 -鉛 -鉢 -鉤 -鉦 -鉱 -鉾 -銀 -銃 -銅 -銈 -銑 -銕 -銘 -銚 -銜 -銭 -鋏 -鋒 -鋤 -鋭 -鋲 -鋳 -鋸 -鋺 -鋼 -錆 -錍 -錐 -錘 -錠 -錣 -錦 -錫 -錬 -錯 -録 -錵 -鍋 -鍍 -鍑 -鍔 -鍛 -鍬 -鍮 -鍵 -鍼 -鍾 -鎌 -鎖 -鎗 -鎚 -鎧 -鎬 -鎮 -鎰 -鎹 -鏃 -鏑 -鏡 -鐃 -鐇 -鐐 -鐔 -鐘 -鐙 -鐚 -鐡 -鐵 -鐸 -鑁 -鑊 -鑑 -鑒 -鑚 -鑠 -鑢 -鑰 -鑵 -鑷 -鑼 -鑽 -鑿 -長 -門 -閃 -閇 -閉 -開 -閏 -閑 -間 -閔 -閘 -関 -閣 -閤 -閥 -閦 -閨 -閬 -閲 -閻 -閼 -閾 -闇 -闍 -闔 -闕 -闘 -關 -闡 -闢 -闥 -阜 -阪 -阮 -阯 -防 -阻 -阿 -陀 -陂 -附 -陌 -降 -限 -陛 -陞 -院 -陣 -除 -陥 -陪 -陬 -陰 -陳 -陵 -陶 -陸 -険 -陽 -隅 -隆 -隈 -隊 -隋 -階 -随 -隔 -際 -障 -隠 -隣 -隧 -隷 -隻 -隼 -雀 -雁 -雄 -雅 -集 -雇 -雉 -雊 -雋 -雌 -雍 -雑 -雖 -雙 -雛 -離 -難 -雨 -雪 -雫 -雰 -雲 -零 -雷 -雹 -電 -需 -震 -霊 -霍 -霖 -霜 -霞 -霧 -霰 -露 -靈 -青 -靖 -静 -靜 -非 -面 -革 -靫 -靭 -靱 -靴 -靺 -鞁 -鞄 -鞆 -鞋 -鞍 -鞏 -鞘 -鞠 -鞨 -鞭 -韋 -韓 -韜 -韮 -音 -韶 -韻 -響 -頁 -頂 -頃 -項 -順 -須 -頌 -預 -頑 -頒 -頓 -領 -頚 -頬 -頭 -頴 -頸 -頻 -頼 -顆 -題 -額 -顎 -顔 -顕 -顗 -願 -顛 -類 -顧 -顯 -風 -飛 -食 -飢 -飩 -飫 -飯 -飲 -飴 -飼 -飽 -飾 -餃 -餅 -餉 -養 -餌 -餐 -餓 -餘 -餝 -餡 -館 -饂 -饅 -饉 -饋 -饌 -饒 -饗 -首 -馗 -香 -馨 -馬 -馳 -馴 -駄 -駅 -駆 
-駈 -駐 -駒 -駕 -駝 -駿 -騁 -騎 -騏 -騒 -験 -騙 -騨 -騰 -驕 -驚 -驛 -驢 -骨 -骸 -髄 -體 -高 -髙 -髢 -髪 -髭 -髮 -髷 -髻 -鬘 -鬚 -鬢 -鬨 -鬯 -鬱 -鬼 -魁 -魂 -魄 -魅 -魏 -魔 -魚 -魯 -鮎 -鮑 -鮒 -鮪 -鮫 -鮭 -鮮 -鯉 -鯔 -鯖 -鯛 -鯨 -鯰 -鯱 -鰐 -鰒 -鰭 -鰯 -鰰 -鰹 -鰻 -鱈 -鱒 -鱗 -鱧 -鳥 -鳩 -鳰 -鳳 -鳴 -鳶 -鴈 -鴉 -鴎 -鴛 -鴟 -鴦 -鴨 -鴫 -鴻 -鵄 -鵜 -鵞 -鵡 -鵬 -鵲 -鵺 -鶉 -鶏 -鶯 -鶴 -鷄 -鷙 -鷲 -鷹 -鷺 -鸚 -鸞 -鹸 -鹽 -鹿 -麁 -麒 -麓 -麗 -麝 -麞 -麟 -麦 -麩 -麹 -麺 -麻 -麾 -麿 -黄 -黌 -黍 -黒 -黙 -黛 -黠 -鼈 -鼉 -鼎 -鼓 -鼠 -鼻 -齊 -齋 -齟 -齢 -齬 -龍 -龕 -龗 -! -# -% -& -( -) -+ -, -- -. -/ -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -: -; -= -? -@ -A -B -C -D -E -F -G -H -I -J -K -L -M -N -O -P -R -S -T -U -V -W -X -Z -a -c -d -e -f -h -i -j -k -l -m -n -o -p -r -s -t -u -y -z -~ -・ - diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ka_dict.txt b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ka_dict.txt deleted file mode 100644 index d506b691bd1a6c55299ad89a72cf3a69a2c879a9..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ka_dict.txt +++ /dev/null @@ -1,153 +0,0 @@ -k -a -_ -i -m -g -/ -1 -2 -I -L -S -V -R -C -0 -v -l -6 -4 -8 -. -j -p -ಗ -ು -ಣ -ಪ -ಡ -ಿ -ಸ -ಲ -ಾ -ದ -್ -7 -5 -3 -ವ -ಷ -ಬ -ಹ -ೆ -9 -ಅ -ಳ -ನ -ರ -ಉ -ಕ -ಎ -ೇ -ಂ -ೈ -ೊ -ೀ -ಯ -ೋ -ತ -ಶ -ಭ -ಧ -ಚ -ಜ -ೂ -ಮ -ಒ -ೃ -ಥ -ಇ -ಟ -ಖ -ಆ -ಞ -ಫ -- -ಢ -ಊ -ಓ -ಐ -ಃ -ಘ -ಝ -ೌ -ಠ -ಛ -ಔ -ಏ -ಈ -ಋ -೨ -೦ -೧ -೮ -೯ -೪ -, -೫ -೭ -೩ -೬ -ಙ -s -c -e -n -w -o -u -t -d -E -A -T -B -Z -N -G -O -q -z -r -x -P -K -M -J -U -D -f -F -h -b -W -Y -y -H -X -Q -' -# -& -! -@ -$ -: -% -é -É -( -? -+ - diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/korean_dict.txt b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/korean_dict.txt deleted file mode 100644 index a13899f14dfe3bfc25b34904390c7b1e4ed8674b..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/korean_dict.txt +++ /dev/null @@ -1,3688 +0,0 @@ -! -" -# -$ -% -& -' -* -+ -- -/ -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -: -; -< -= -> -? 
-A -B -C -D -E -F -G -H -I -J -K -L -M -N -O -P -Q -R -S -T -U -V -W -X -Y -Z -[ -\ -] -^ -_ -` -a -b -c -d -e -f -g -h -i -j -k -l -m -n -o -p -q -r -s -t -u -v -w -x -y -z -{ -| -} -~ -© -° -² -½ -Á -Ä -Å -Ç -É -Í -Î -Ó -Ö -× -Ü -ß -à -á -â -ã -ä -å -æ -ç -è -é -ê -ë -ì -í -î -ï -ð -ñ -ò -ó -ô -õ -ö -ø -ú -û -ü -ý -ā -ă -ą -ć -Č -č -đ -ē -ė -ę -ě -ğ -ī -İ -ı -Ł -ł -ń -ň -ō -ř -Ş -ş -Š -š -ţ -ū -ź -ż -Ž -ž -Ș -ș -Α -Δ -α -λ -φ -Г -О -а -в -л -о -р -с -т -я -​ -’ -“ -” -→ -∇ -∼ -「 -」 -ア -カ -グ -ニ -ラ -ン -ㄱ -ㄴ -ㄷ -ㄸ -ㄹ -ㅂ -ㅅ -ㅆ -ㅇ -ㅈ -ㅊ -ㅋ -ㅌ -ㅎ -ㅓ -ㅜ -ㅣ -一 -丁 -七 -三 -上 -下 -不 -丑 -世 -丘 -丞 -中 -丸 -丹 -主 -乃 -久 -之 -乎 -乘 -九 -也 -乳 -乾 -事 -二 -云 -互 -五 -井 -亞 -亡 -交 -亥 -亨 -享 -京 -亭 -人 -仁 -今 -他 -仙 -代 -令 -以 -仰 -仲 -件 -任 -企 -伊 -伍 -伎 -伏 -伐 -休 -伯 -伴 -伸 -佃 -佈 -位 -低 -住 -佐 -何 -佛 -作 -使 -來 -供 -依 -侯 -侵 -侶 -便 -俗 -保 -俠 -信 -修 -俱 -俳 -倉 -個 -倍 -倒 -候 -借 -値 -倫 -倭 -假 -偈 -偉 -偏 -停 -偶 -傅 -傑 -傳 -傷 -傾 -像 -僞 -僥 -僧 -價 -儀 -儉 -儒 -優 -儼 -兀 -允 -元 -兆 -先 -光 -克 -兒 -入 -內 -全 -八 -公 -六 -共 -兵 -其 -具 -典 -兼 -再 -冠 -冥 -冶 -准 -凞 -凡 -凱 -出 -函 -刀 -分 -刊 -刑 -列 -初 -判 -別 -利 -到 -制 -券 -刺 -刻 -則 -前 -剛 -副 -創 -劃 -劑 -力 -功 -加 -劣 -助 -劫 -勇 -動 -務 -勝 -勢 -勳 -勸 -匈 -化 -北 -匠 -區 -十 -千 -午 -半 -卍 -卑 -卒 -卓 -南 -博 -卜 -占 -卦 -印 -危 -卵 -卷 -卽 -卿 -厄 -原 -厦 -去 -參 -又 -叉 -友 -反 -叔 -受 -口 -古 -句 -可 -台 -史 -右 -司 -各 -合 -吉 -同 -名 -后 -吏 -吐 -君 -吠 -吳 -呂 -告 -周 -味 -呵 -命 -和 -咳 -咸 -咽 -哀 -品 -哨 -哮 -哲 -唐 -唯 -唱 -商 -問 -啼 -善 -喆 -喉 -喜 -喩 -喪 -嘗 -器 -嚴 -囊 -四 -回 -因 -困 -固 -圈 -國 -圍 -園 -圓 -圖 -團 -土 -在 -地 -均 -坊 -坐 -坑 -坵 -型 -垢 -城 -域 -埴 -執 -培 -基 -堂 -堅 -堆 -堤 -堯 -報 -場 -塔 -塚 -塞 -塵 -境 -墜 -墟 -墨 -墳 -墾 -壁 -壇 -壓 -壤 -士 -壬 -壯 -壺 -壽 -夏 -夕 -外 -多 -夜 -夢 -大 -天 -太 -夫 -央 -失 -夷 -奄 -奇 -奉 -奎 -奏 -契 -奔 -奮 -女 -奴 -好 -如 -妄 -妊 -妖 -妙 -始 -姑 -姓 -姚 -姜 -威 -婆 -婚 -婦 -媒 -媚 -子 -孔 -字 -存 -孝 -孟 -季 -孤 -孫 -學 -孺 -宇 -守 -安 -宋 -宗 -官 -宙 -定 -客 -宣 -室 -宮 -害 -家 -容 -寂 -寃 -寄 -寅 -密 -寇 -富 -寒 -寓 -實 -審 -寫 -寬 -寶 -寸 -寺 -封 -將 -專 -尊 -對 -小 -少 -尙 -尹 -尼 -尿 -局 -居 -屈 -屋 -屍 -屎 -屛 -層 -屬 -山 -岐 -岡 -岩 -岳 -岸 -峙 -峰 -島 -峻 -峽 -崇 -崔 -崖 -崩 -嶋 -巖 -川 -州 -巢 -工 -左 -巧 -巨 -巫 -差 -己 -巷 -市 -布 -帝 -師 -帶 -常 -帽 -幕 -干 -平 -年 -幹 -幻 -幼 -幽 -庇 -序 -店 -府 -度 -座 -庫 -庭 -康 -廟 -廣 -廳 -延 -廷 -建 -廻 -弁 -式 -弑 -弓 -引 -弘 -弟 -弱 -張 -强 -弼 -彌 -彛 -形 -彬 -影 -役 -彼 -彿 -往 -征 -待 -律 -後 -徐 -徑 -得 -從 -循 -微 -德 -徹 -心 -必 -忌 -忍 -志 -忠 -思 -怡 -急 -性 -恐 -恒 -恨 -恩 -悅 -悖 -患 -悲 -情 -惑 -惟 -惠 -惡 -想 -惺 -愁 -意 -愚 -愛 -感 -愼 -慈 -態 -慕 -慣 -慧 -慾 -憂 -憤 -憺 -應 -懸 -戎 -成 -我 -戟 -戮 -戰 -戴 -戶 -房 -所 -手 -才 -打 -批 -承 -技 -抄 -把 -抗 -抱 -抽 -拇 -拓 -拘 -拙 -拜 -拾 -持 -指 -捌 -捨 -捿 -授 -掌 -排 -接 -推 -提 -揚 -揭 -援 -損 -搗 -摩 -播 -操 -擒 -擔 -擘 -據 -擧 -攘 -攝 -攬 -支 -改 -攻 -放 -政 -故 -敍 -敎 -救 -敗 -散 -敬 -整 -數 -文 -斗 -料 -斛 -斜 -斧 -斯 -新 -斷 -方 -於 -施 -旋 -族 -旗 -日 -旨 -早 -旱 -昌 -明 -易 -昔 -星 -春 -昧 -昭 -是 -時 -晉 -晋 -晩 -普 -景 -晴 -晶 -智 -暈 -暑 -暗 -暘 -曉 -曜 -曠 -曦 -曰 -曲 -書 -曹 -曼 -曾 -最 -會 -月 -有 -朋 -服 -望 -朝 -期 -木 -未 -末 -本 -朱 -朴 -李 -材 -村 -杖 -杜 -杞 -杭 -杯 -東 -松 -板 -林 -果 -枝 -枯 -枰 -枾 -柏 -柑 -柱 -栗 -校 -栢 -核 -根 -格 -桀 -桂 -案 -桎 -桑 -桓 -桔 -梁 -梏 -梓 -梗 -條 -梨 -梵 -棗 -棟 -森 -植 -椒 -楊 -楓 -楚 -業 -楮 -極 -榮 -槃 -槍 -樂 -樓 -樗 -樣 -樸 -樹 -樺 -樽 -橄 -橋 -橘 -機 -橡 -檀 -檎 -權 -欌 -欖 -次 -欲 -歌 -歐 -止 -正 -此 -步 -武 -歲 -歸 -死 -殖 -段 -殷 -殺 -殿 -毅 -母 -毒 -比 -毛 -氏 -民 -氣 -水 -永 -求 -汎 -汗 -江 -池 -沅 -沒 -沖 -沙 -沛 -河 -油 -治 -沼 -沿 -泉 -泊 -法 -泗 -泡 -波 -注 -泰 -洋 -洙 -洛 -洞 -津 -洲 -活 -派 -流 -浅 -浦 -浮 -浴 -海 -涅 -涇 -消 -涌 -液 -淑 -淡 -淨 -淫 -深 -淳 -淵 -淸 -渠 -渡 -游 -渾 -湖 -湯 -源 -溪 -溫 -溶 -滄 -滅 -滋 -滯 -滿 -漁 -漆 -漢 -漫 -漸 -潑 -潤 -潭 -澄 -澎 -澤 -澳 -澹 -濁 -濕 -濟 -濤 -濯 -瀋 -瀝 -灣 -火 -灰 -灸 -災 -炎 -炭 -点 -烈 -烏 -烙 -焚 -無 -焦 -然 -煌 -煎 -照 -煬 -煮 -熟 -熱 -燁 -燈 -燔 -燕 -燥 -燧 -燮 -爲 -爵 -父 -片 -版 -牌 -牛 -牝 -牟 -牡 -物 -特 -犧 -犬 -狀 -狗 -猥 -猩 -猪 -獨 -獵 -獸 -獻 -玄 -玉 -王 -玲 -珍 -珠 -珪 -班 -現 -球 -理 -琴 -瑞 -瑟 -瑪 -璃 -璋 -璽 -瓜 -瓦 -甑 -甘 -生 -産 -用 -甫 -田 -由 -甲 -申 -男 -界 -畏 -留 -畜 -畢 -略 -番 -異 -畵 -當 -畸 -疏 -疑 -疫 -疹 -疼 -病 -症 -痔 -痛 -痺 -瘀 -瘍 -瘡 -療 -癌 -癖 -登 -發 -白 -百 -的 -皆 -皇 -皮 -盂 -盆 -益 -盛 -盜 -盟 -盡 -盤 -盧 -目 -直 
-相 -省 -看 -眞 -眼 -睡 -督 -瞋 -矢 -矣 -知 -短 -石 -破 -碍 -碑 -磁 -磨 -磬 -示 -社 -祇 -祖 -祝 -神 -祥 -祭 -祺 -禁 -禅 -禍 -福 -禦 -禪 -禮 -禹 -禽 -禾 -秀 -私 -秉 -秋 -科 -秘 -秤 -秦 -秩 -移 -稀 -稗 -種 -稱 -稷 -稼 -稽 -穀 -穆 -積 -空 -窮 -竅 -立 -章 -童 -竭 -端 -竹 -笑 -符 -第 -筆 -等 -筍 -答 -策 -箋 -箕 -管 -箱 -節 -篇 -簡 -米 -粉 -粘 -粥 -精 -糖 -糞 -系 -紀 -紂 -約 -紅 -紋 -純 -紙 -級 -素 -索 -紫 -紬 -累 -細 -紳 -終 -組 -結 -絡 -統 -絲 -絶 -絹 -經 -綠 -維 -綱 -網 -綸 -綽 -緖 -線 -緣 -緯 -縣 -縱 -總 -織 -繡 -繩 -繪 -繭 -纂 -續 -罕 -置 -罰 -羅 -羊 -美 -群 -義 -羽 -翁 -習 -翟 -老 -考 -者 -而 -耐 -耕 -耳 -聃 -聖 -聞 -聰 -聲 -職 -肇 -肉 -肖 -肝 -股 -肥 -育 -肺 -胃 -胎 -胚 -胞 -胡 -胥 -能 -脂 -脈 -脚 -脛 -脣 -脩 -脫 -脯 -脾 -腋 -腎 -腫 -腸 -腹 -膜 -膠 -膨 -膽 -臆 -臟 -臣 -臥 -臨 -自 -至 -致 -臺 -臼 -臾 -與 -興 -舊 -舌 -舍 -舒 -舜 -舟 -般 -船 -艦 -良 -色 -芋 -花 -芳 -芽 -苑 -苔 -苕 -苛 -苞 -若 -苦 -英 -茂 -茵 -茶 -茹 -荀 -荇 -草 -荒 -荷 -莊 -莫 -菊 -菌 -菜 -菩 -菫 -華 -菴 -菽 -萊 -萍 -萬 -落 -葉 -著 -葛 -董 -葬 -蒙 -蒜 -蒲 -蒸 -蒿 -蓮 -蔓 -蔘 -蔡 -蔬 -蕃 -蕉 -蕓 -薄 -薑 -薛 -薩 -薪 -薺 -藏 -藝 -藤 -藥 -藩 -藻 -蘆 -蘇 -蘊 -蘚 -蘭 -虎 -處 -虛 -虞 -虹 -蜀 -蜂 -蜜 -蝕 -蝶 -融 -蟬 -蟲 -蠶 -蠻 -血 -衆 -行 -術 -衛 -衡 -衣 -表 -袁 -裔 -裕 -裙 -補 -製 -複 -襄 -西 -要 -見 -視 -親 -覺 -觀 -角 -解 -言 -訂 -訊 -訓 -託 -記 -訣 -設 -診 -註 -評 -詩 -話 -詵 -誅 -誌 -認 -誕 -語 -誠 -誤 -誥 -誦 -說 -調 -談 -諍 -論 -諡 -諫 -諭 -諸 -謙 -講 -謝 -謠 -證 -識 -譚 -譜 -譯 -議 -護 -讀 -變 -谷 -豆 -豊 -豚 -象 -豪 -豫 -貝 -貞 -財 -貧 -貨 -貪 -貫 -貴 -貸 -費 -資 -賊 -賓 -賞 -賢 -賣 -賦 -質 -贍 -赤 -赫 -走 -起 -超 -越 -趙 -趣 -趨 -足 -趾 -跋 -跡 -路 -踏 -蹟 -身 -躬 -車 -軍 -軒 -軟 -載 -輓 -輕 -輪 -輯 -輸 -輻 -輿 -轅 -轉 -辨 -辭 -辯 -辰 -農 -近 -迦 -述 -追 -逆 -透 -逐 -通 -逝 -造 -逢 -連 -進 -逵 -遂 -遊 -運 -遍 -過 -道 -達 -遠 -遡 -適 -遷 -選 -遺 -遽 -還 -邊 -邑 -那 -邪 -郞 -郡 -部 -都 -鄒 -鄕 -鄭 -鄲 -配 -酒 -酸 -醉 -醫 -醯 -釋 -里 -重 -野 -量 -釐 -金 -針 -鈍 -鈴 -鉞 -銀 -銅 -銘 -鋼 -錄 -錢 -錦 -鎭 -鏡 -鐘 -鐵 -鑑 -鑛 -長 -門 -閃 -開 -間 -閔 -閣 -閥 -閭 -閻 -闕 -關 -阪 -防 -阿 -陀 -降 -限 -陝 -院 -陰 -陳 -陵 -陶 -陸 -陽 -隆 -隊 -隋 -階 -際 -障 -隣 -隨 -隱 -隷 -雀 -雄 -雅 -集 -雇 -雌 -雖 -雙 -雜 -離 -難 -雨 -雪 -雲 -電 -霜 -露 -靈 -靑 -靖 -靜 -非 -面 -革 -靴 -鞏 -韓 -音 -韶 -韻 -順 -須 -頊 -頌 -領 -頭 -顔 -願 -顚 -類 -顯 -風 -飛 -食 -飢 -飮 -飯 -飾 -養 -餓 -餘 -首 -香 -馨 -馬 -駒 -騫 -騷 -驕 -骨 -骸 -髓 -體 -高 -髥 -髮 -鬪 -鬱 -鬼 -魏 -魔 -魚 -魯 -鮮 -鰍 -鰐 -鳥 -鳧 -鳳 -鴨 -鵲 -鶴 -鷄 -鷹 -鹽 -鹿 -麗 -麥 -麻 -黃 -黑 -默 -點 -黨 -鼎 -齊 -齋 -齒 -龍 -龜 -가 -각 -간 -갇 -갈 -갉 -감 -갑 -값 -갓 -갔 -강 -갖 -갗 -같 -갚 -갛 -개 -객 -갠 -갤 -갬 -갭 -갯 -갰 -갱 -갸 -걀 -걔 -걘 -거 -걱 -건 -걷 -걸 -검 -겁 -것 -겄 -겅 -겆 -겉 -겊 -겋 -게 -겐 -겔 -겟 -겠 -겡 -겨 -격 -겪 -견 -결 -겸 -겹 -겻 -겼 -경 -곁 -계 -곕 -곗 -고 -곡 -곤 -곧 -골 -곪 -곬 -곯 -곰 -곱 -곳 -공 -곶 -과 -곽 -관 -괄 -괌 -광 -괘 -괜 -괭 -괴 -괸 -굉 -교 -구 -국 -군 -굳 -굴 -굵 -굶 -굼 -굽 -굿 -궁 -궂 -궈 -권 -궐 -궜 -궝 -궤 -귀 -귄 -귈 -귓 -규 -균 -귤 -그 -극 -근 -글 -긁 -금 -급 -긋 -긍 -기 -긴 -길 -김 -깁 -깃 -깅 -깊 -까 -깍 -깎 -깐 -깔 -깜 -깝 -깟 -깡 -깥 -깨 -깬 -깰 -깻 -깼 -깽 -꺄 -꺼 -꺽 -꺾 -껀 -껄 -껌 -껍 -껏 -껐 -껑 -께 -껴 -꼈 -꼍 -꼐 -꼬 -꼭 -꼴 -꼼 -꼽 -꼿 -꽁 -꽂 -꽃 -꽉 -꽝 -꽤 -꽥 -꾀 -꾜 -꾸 -꾹 -꾼 -꿀 -꿇 -꿈 -꿉 -꿋 -꿍 -꿎 -꿔 -꿨 -꿩 -꿰 -꿴 -뀄 -뀌 -뀐 -뀔 -뀜 -뀝 -끄 -끈 -끊 -끌 -끓 -끔 -끕 -끗 -끙 -끝 -끼 -끽 -낀 -낄 -낌 -낍 -낏 -낑 -나 -낙 -낚 -난 -낟 -날 -낡 -남 -납 -낫 -났 -낭 -낮 -낯 -낱 -낳 -내 -낵 -낸 -낼 -냄 -냅 -냇 -냈 -냉 -냐 -냔 -냘 -냥 -너 -넉 -넋 -넌 -널 -넓 -넘 -넙 -넛 -넜 -넝 -넣 -네 -넥 -넨 -넬 -넴 -넵 -넷 -넸 -넹 -녀 -녁 -년 -념 -녔 -녕 -녘 -녜 -노 -녹 -논 -놀 -놈 -놋 -농 -높 -놓 -놔 -놨 -뇌 -뇨 -뇩 -뇽 -누 -눅 -눈 -눌 -눔 -눕 -눗 -눠 -눴 -뉘 -뉜 -뉩 -뉴 -늄 -늅 -늉 -느 -늑 -는 -늘 -늙 -늠 -늡 -능 -늦 -늪 -늬 -니 -닉 -닌 -닐 -님 -닙 -닛 -닝 -닢 -다 -닥 -닦 -단 -닫 -달 -닭 -닮 -닯 -닳 -담 -답 -닷 -당 -닻 -닿 -대 -댁 -댄 -댈 -댐 -댑 -댓 -댔 -댕 -댜 -더 -덕 -덖 -던 -덜 -덟 -덤 -덥 -덧 -덩 -덫 -덮 -데 -덱 -덴 -델 -뎀 -뎃 -뎅 -뎌 -뎠 -뎨 -도 -독 -돈 -돋 -돌 -돔 -돕 -돗 -동 -돛 -돝 -돼 -됐 -되 -된 -될 -됨 -됩 -됴 -두 -둑 -둔 -둘 -둠 -둡 -둣 -둥 -둬 -뒀 -뒤 -뒬 -뒷 -뒹 -듀 -듈 -듐 -드 -득 -든 -듣 -들 -듦 -듬 -듭 -듯 -등 -듸 -디 -딕 -딘 -딛 -딜 -딤 -딥 -딧 -딨 -딩 -딪 -따 -딱 -딴 -딸 -땀 -땄 -땅 -때 -땐 -땔 -땜 -땝 -땠 -땡 -떠 -떡 -떤 -떨 -떫 -떰 -떱 -떳 -떴 -떵 -떻 -떼 -떽 -뗀 -뗄 -뗍 -뗏 -뗐 -뗑 -또 -똑 -똘 -똥 -뙤 -뚜 -뚝 -뚤 -뚫 -뚱 -뛰 -뛴 -뛸 -뜀 -뜁 -뜨 -뜩 -뜬 -뜯 -뜰 -뜸 -뜻 -띄 -띈 -띌 -띔 -띕 -띠 -띤 -띨 -띱 -띵 -라 -락 -란 -랄 -람 -랍 -랏 -랐 -랑 -랒 -랗 -래 -랙 -랜 -랠 -램 -랩 -랫 -랬 -랭 -랴 -략 -량 -러 -럭 -런 -럴 -럼 -럽 -럿 -렀 -렁 -렇 -레 -렉 -렌 -렐 -렘 -렙 -렛 -렝 -려 -력 -련 -렬 -렴 
-렵 -렷 -렸 -령 -례 -로 -록 -론 -롤 -롬 -롭 -롯 -롱 -롸 -롹 -뢰 -뢴 -뢸 -룃 -료 -룐 -룡 -루 -룩 -룬 -룰 -룸 -룹 -룻 -룽 -뤄 -뤘 -뤼 -류 -륙 -륜 -률 -륨 -륭 -르 -륵 -른 -를 -름 -릅 -릇 -릉 -릎 -리 -릭 -린 -릴 -림 -립 -릿 -링 -마 -막 -만 -많 -맏 -말 -맑 -맘 -맙 -맛 -망 -맞 -맡 -맣 -매 -맥 -맨 -맬 -맴 -맵 -맷 -맸 -맹 -맺 -먀 -먁 -머 -먹 -먼 -멀 -멈 -멋 -멍 -멎 -메 -멕 -멘 -멜 -멤 -멥 -멧 -멩 -며 -멱 -면 -멸 -몄 -명 -몇 -모 -목 -몫 -몬 -몰 -몸 -몹 -못 -몽 -뫼 -묘 -무 -묵 -묶 -문 -묻 -물 -묽 -뭄 -뭅 -뭇 -뭉 -뭍 -뭏 -뭐 -뭔 -뭘 -뭡 -뭣 -뮈 -뮌 -뮐 -뮤 -뮬 -므 -믈 -믐 -미 -믹 -민 -믿 -밀 -밈 -밉 -밋 -밌 -밍 -및 -밑 -바 -박 -밖 -반 -받 -발 -밝 -밟 -밤 -밥 -밧 -방 -밭 -배 -백 -밴 -밸 -뱀 -뱁 -뱃 -뱄 -뱅 -뱉 -뱍 -뱐 -버 -벅 -번 -벌 -범 -법 -벗 -벙 -벚 -베 -벡 -벤 -벨 -벰 -벱 -벳 -벵 -벼 -벽 -변 -별 -볍 -볏 -볐 -병 -볕 -보 -복 -볶 -본 -볼 -봄 -봅 -봇 -봉 -봐 -봤 -뵈 -뵐 -뵙 -부 -북 -분 -붇 -불 -붉 -붐 -붓 -붕 -붙 -뷔 -뷰 -뷴 -뷸 -브 -븐 -블 -비 -빅 -빈 -빌 -빔 -빕 -빗 -빙 -빚 -빛 -빠 -빡 -빤 -빨 -빳 -빴 -빵 -빻 -빼 -빽 -뺀 -뺄 -뺌 -뺏 -뺐 -뺑 -뺨 -뻐 -뻑 -뻔 -뻗 -뻘 -뻣 -뻤 -뻥 -뻬 -뼈 -뼉 -뼘 -뽀 -뽈 -뽐 -뽑 -뽕 -뾰 -뿌 -뿍 -뿐 -뿔 -뿜 -쁘 -쁜 -쁠 -쁨 -삐 -삔 -삘 -사 -삭 -삯 -산 -살 -삵 -삶 -삼 -삽 -삿 -샀 -상 -샅 -새 -색 -샌 -샐 -샘 -샙 -샛 -샜 -생 -샤 -샨 -샬 -샴 -샵 -샷 -샹 -서 -석 -섞 -선 -섣 -설 -섬 -섭 -섯 -섰 -성 -섶 -세 -섹 -센 -셀 -셈 -셉 -셋 -셌 -셍 -셔 -션 -셜 -셨 -셰 -셴 -셸 -소 -속 -손 -솔 -솜 -솝 -솟 -송 -솥 -쇄 -쇠 -쇤 -쇳 -쇼 -숀 -숄 -숍 -수 -숙 -순 -숟 -술 -숨 -숩 -숫 -숭 -숯 -숱 -숲 -숴 -쉐 -쉘 -쉬 -쉭 -쉰 -쉴 -쉼 -쉽 -슈 -슐 -슘 -슛 -슝 -스 -슥 -슨 -슬 -슭 -슴 -습 -슷 -승 -시 -식 -신 -싣 -실 -싫 -심 -십 -싯 -싱 -싶 -싸 -싹 -싼 -쌀 -쌈 -쌉 -쌌 -쌍 -쌓 -쌔 -쌘 -쌩 -써 -썩 -썬 -썰 -썸 -썹 -썼 -썽 -쎄 -쎈 -쏘 -쏙 -쏜 -쏟 -쏠 -쏭 -쏴 -쐈 -쐐 -쐬 -쑤 -쑥 -쑨 -쒀 -쒔 -쓰 -쓱 -쓴 -쓸 -씀 -씁 -씌 -씨 -씩 -씬 -씰 -씸 -씹 -씻 -씽 -아 -악 -안 -앉 -않 -알 -앎 -앓 -암 -압 -앗 -았 -앙 -앞 -애 -액 -앤 -앨 -앰 -앱 -앳 -앴 -앵 -야 -약 -얀 -얄 -얇 -얌 -얍 -얏 -양 -얕 -얗 -얘 -얜 -어 -억 -언 -얹 -얻 -얼 -얽 -엄 -업 -없 -엇 -었 -엉 -엊 -엌 -엎 -에 -엑 -엔 -엘 -엠 -엡 -엣 -엥 -여 -역 -엮 -연 -열 -엷 -염 -엽 -엾 -엿 -였 -영 -옅 -옆 -옇 -예 -옌 -옐 -옙 -옛 -오 -옥 -온 -올 -옭 -옮 -옳 -옴 -옵 -옷 -옹 -옻 -와 -왁 -완 -왈 -왑 -왓 -왔 -왕 -왜 -왠 -왱 -외 -왼 -요 -욕 -욘 -욜 -욤 -용 -우 -욱 -운 -울 -움 -웁 -웃 -웅 -워 -웍 -원 -월 -웜 -웠 -웡 -웨 -웬 -웰 -웸 -웹 -위 -윅 -윈 -윌 -윔 -윗 -윙 -유 -육 -윤 -율 -윱 -윳 -융 -으 -윽 -은 -을 -읊 -음 -읍 -응 -의 -읜 -읠 -이 -익 -인 -일 -읽 -잃 -임 -입 -잇 -있 -잉 -잊 -잎 -자 -작 -잔 -잖 -잘 -잠 -잡 -잣 -잤 -장 -잦 -재 -잭 -잰 -잴 -잽 -잿 -쟀 -쟁 -쟈 -쟉 -쟤 -저 -적 -전 -절 -젊 -점 -접 -젓 -정 -젖 -제 -젝 -젠 -젤 -젬 -젭 -젯 -져 -젼 -졀 -졌 -졍 -조 -족 -존 -졸 -좀 -좁 -종 -좇 -좋 -좌 -좍 -좽 -죄 -죠 -죤 -주 -죽 -준 -줄 -줌 -줍 -줏 -중 -줘 -줬 -쥐 -쥔 -쥘 -쥬 -쥴 -즈 -즉 -즌 -즐 -즘 -즙 -증 -지 -직 -진 -짇 -질 -짊 -짐 -집 -짓 -징 -짖 -짙 -짚 -짜 -짝 -짠 -짢 -짤 -짧 -짬 -짭 -짰 -짱 -째 -짹 -짼 -쨀 -쨉 -쨋 -쨌 -쨍 -쩄 -쩌 -쩍 -쩐 -쩔 -쩜 -쩝 -쩡 -쩨 -쪄 -쪘 -쪼 -쪽 -쪾 -쫀 -쫄 -쫑 -쫓 -쫙 -쬐 -쭈 -쭉 -쭐 -쭙 -쯔 -쯤 -쯧 -찌 -찍 -찐 -찔 -찜 -찝 -찡 -찢 -찧 -차 -착 -찬 -찮 -찰 -참 -찹 -찻 -찼 -창 -찾 -채 -책 -챈 -챌 -챔 -챕 -챗 -챘 -챙 -챠 -챤 -처 -척 -천 -철 -첨 -첩 -첫 -청 -체 -첵 -첸 -첼 -쳄 -쳇 -쳉 -쳐 -쳔 -쳤 -초 -촉 -촌 -촘 -촛 -총 -촨 -촬 -최 -쵸 -추 -축 -춘 -출 -춤 -춥 -춧 -충 -춰 -췄 -췌 -취 -췬 -츄 -츠 -측 -츨 -츰 -층 -치 -칙 -친 -칠 -칡 -침 -칩 -칫 -칭 -카 -칵 -칸 -칼 -캄 -캅 -캇 -캉 -캐 -캔 -캘 -캠 -캡 -캣 -캤 -캥 -캬 -커 -컥 -컨 -컫 -컬 -컴 -컵 -컷 -컸 -컹 -케 -켄 -켈 -켐 -켓 -켕 -켜 -켠 -켤 -켭 -켯 -켰 -코 -콕 -콘 -콜 -콤 -콥 -콧 -콩 -콰 -콱 -콴 -콸 -쾅 -쾌 -쾡 -쾨 -쾰 -쿄 -쿠 -쿡 -쿤 -쿨 -쿰 -쿵 -쿼 -퀀 -퀄 -퀘 -퀭 -퀴 -퀵 -퀸 -퀼 -큐 -큘 -크 -큰 -클 -큼 -큽 -키 -킥 -킨 -킬 -킴 -킵 -킷 -킹 -타 -탁 -탄 -탈 -탉 -탐 -탑 -탓 -탔 -탕 -태 -택 -탠 -탤 -탬 -탭 -탯 -탰 -탱 -터 -턱 -턴 -털 -텀 -텁 -텃 -텄 -텅 -테 -텍 -텐 -텔 -템 -텝 -텡 -텨 -톈 -토 -톡 -톤 -톨 -톰 -톱 -톳 -통 -퇴 -툇 -투 -툭 -툰 -툴 -툼 -퉁 -퉈 -퉜 -튀 -튄 -튈 -튕 -튜 -튠 -튤 -튬 -트 -특 -튼 -튿 -틀 -틈 -틉 -틋 -틔 -티 -틱 -틴 -틸 -팀 -팁 -팅 -파 -팍 -팎 -판 -팔 -팜 -팝 -팟 -팠 -팡 -팥 -패 -팩 -팬 -팰 -팸 -팻 -팼 -팽 -퍼 -퍽 -펀 -펄 -펌 -펍 -펐 -펑 -페 -펙 -펜 -펠 -펨 -펩 -펫 -펭 -펴 -편 -펼 -폄 -폈 -평 -폐 -포 -폭 -폰 -폴 -폼 -폿 -퐁 -표 -푭 -푸 -푹 -푼 -풀 -품 -풋 -풍 -퓨 -퓬 -퓰 -퓸 -프 -픈 -플 -픔 -픕 -피 -픽 -핀 -필 -핌 -핍 -핏 -핑 -하 -학 -한 -할 -핥 -함 -합 -핫 -항 -해 -핵 -핸 -핼 -햄 -햅 -햇 -했 -행 -햐 -향 -헀 -허 -헉 -헌 -헐 -험 -헙 -헛 -헝 -헤 -헥 -헨 -헬 -헴 -헵 -헷 -헹 -혀 -혁 -현 -혈 -혐 -협 -혓 -혔 -형 -혜 -호 -혹 -혼 -홀 -홈 -홉 -홋 -홍 -홑 -화 -확 -환 -활 -홧 -황 -홰 -홱 -횃 -회 -획 -횝 -횟 -횡 -효 -후 -훅 -훈 -훌 -훑 -훔 -훗 -훤 -훨 -훼 -휄 -휑 -휘 
-휙 -휜 -휠 -휩 -휭 -휴 -휼 -흄 -흉 -흐 -흑 -흔 -흘 -흙 -흠 -흡 -흣 -흥 -흩 -희 -흰 -흽 -히 -힉 -힌 -힐 -힘 -힙 -힝 -車 -滑 -金 -奈 -羅 -洛 -卵 -欄 -蘭 -郎 -來 -盧 -老 -魯 -綠 -鹿 -論 -雷 -樓 -縷 -凌 -樂 -不 -參 -葉 -沈 -若 -兩 -凉 -梁 -呂 -女 -廬 -麗 -黎 -曆 -歷 -戀 -蓮 -連 -列 -烈 -裂 -念 -獵 -靈 -領 -例 -禮 -醴 -惡 -尿 -料 -遼 -龍 -暈 -柳 -流 -類 -六 -陸 -倫 -律 -栗 -利 -李 -梨 -理 -離 -燐 -林 -臨 -立 -茶 -切 -宅 - diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/latin_dict.txt b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/latin_dict.txt deleted file mode 100644 index e166bf33ecfbdc90ddb3d9743fded23306acabd5..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/latin_dict.txt +++ /dev/null @@ -1,185 +0,0 @@ - -! -" -# -$ -% -& -' -( -) -* -+ -, -- -. -/ -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -: -; -< -= -> -? -@ -A -B -C -D -E -F -G -H -I -J -K -L -M -N -O -P -Q -R -S -T -U -V -W -X -Y -Z -[ -] -_ -` -a -b -c -d -e -f -g -h -i -j -k -l -m -n -o -p -q -r -s -t -u -v -w -x -y -z -{ -} -¡ -£ -§ -ª -« -­ -° -² -³ -´ -µ -· -º -» -¿ -À -Á - -Ä -Å -Ç -È -É -Ê -Ë -Ì -Í -Î -Ï -Ò -Ó -Ô -Õ -Ö -Ú -Ü -Ý -ß -à -á -â -ã -ä -å -æ -ç -è -é -ê -ë -ì -í -î -ï -ñ -ò -ó -ô -õ -ö -ø -ù -ú -û -ü -ý -ą -Ć -ć -Č -č -Đ -đ -ę -ı -Ł -ł -ō -Œ -œ -Š -š -Ÿ -Ž -ž -ʒ -β -δ -ε -з -Ṡ -‘ -€ -™ diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocr_keys_v1.txt b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocr_keys_v1.txt deleted file mode 100644 index 84b885d8352226e49b1d5d791b8f43a663e246aa..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocr_keys_v1.txt +++ /dev/null @@ -1,6623 +0,0 @@ -' -疗 -绚 -诚 -娇 -溜 -题 -贿 -者 -廖 -更 -纳 -加 -奉 -公 -一 -就 -汴 -计 -与 -路 -房 -原 -妇 -2 -0 -8 -- -7 -其 -> -: -] -, -, -骑 -刈 -全 -消 -昏 -傈 -安 -久 -钟 -嗅 -不 -影 -处 -驽 -蜿 -资 -关 -椤 -地 -瘸 -专 -问 -忖 -票 -嫉 -炎 -韵 -要 -月 -田 -节 -陂 -鄙 -捌 -备 -拳 -伺 -眼 -网 -盎 -大 -傍 -心 -东 -愉 -汇 -蹿 -科 -每 -业 -里 -航 -晏 -字 -平 -录 -先 -1 -3 -彤 -鲶 -产 -稍 -督 -腴 -有 -象 -岳 -注 -绍 -在 -泺 -文 -定 -核 -名 -水 -过 -理 -让 -偷 -率 -等 -这 -发 -” -为 -含 -肥 -酉 -相 -鄱 -七 -编 -猥 -锛 -日 -镀 -蒂 -掰 -倒 -辆 -栾 -栗 -综 -涩 -州 -雌 -滑 -馀 -了 -机 -块 -司 -宰 -甙 -兴 -矽 -抚 -保 -用 -沧 -秩 -如 -收 -息 -滥 -页 -疑 -埠 -! -! -姥 -异 -橹 -钇 -向 -下 -跄 -的 -椴 -沫 -国 -绥 -獠 -报 -开 -民 -蜇 -何 -分 -凇 -长 -讥 -藏 -掏 -施 -羽 -中 -讲 -派 -嘟 -人 -提 -浼 -间 -世 -而 -古 -多 -倪 -唇 -饯 -控 -庚 -首 -赛 -蜓 -味 -断 -制 -觉 -技 -替 -艰 -溢 -潮 -夕 -钺 -外 -摘 -枋 -动 -双 -单 -啮 -户 -枇 -确 -锦 -曜 -杜 -或 -能 -效 -霜 -盒 -然 -侗 -电 -晁 -放 -步 -鹃 -新 -杖 -蜂 -吒 -濂 -瞬 -评 -总 -隍 -对 -独 -合 -也 -是 -府 -青 -天 -诲 -墙 -组 -滴 -级 -邀 -帘 -示 -已 -时 -骸 -仄 -泅 -和 -遨 -店 -雇 -疫 -持 -巍 -踮 -境 -只 -亨 -目 -鉴 -崤 -闲 -体 -泄 -杂 -作 -般 -轰 -化 -解 -迂 -诿 -蛭 -璀 -腾 -告 -版 -服 -省 -师 -小 -规 -程 -线 -海 -办 -引 -二 -桧 -牌 -砺 -洄 -裴 -修 -图 -痫 -胡 -许 -犊 -事 -郛 -基 -柴 -呼 -食 -研 -奶 -律 -蛋 -因 -葆 -察 -戏 -褒 -戒 -再 -李 -骁 -工 -貂 -油 -鹅 -章 -啄 -休 -场 -给 -睡 -纷 -豆 -器 -捎 -说 -敏 -学 -会 -浒 -设 -诊 -格 -廓 -查 -来 -霓 -室 -溆 -¢ -诡 -寥 -焕 -舜 -柒 -狐 -回 -戟 -砾 -厄 -实 -翩 -尿 -五 -入 -径 -惭 -喹 -股 -宇 -篝 -| -; -美 -期 -云 -九 -祺 -扮 -靠 -锝 -槌 -系 -企 -酰 -阊 -暂 -蚕 -忻 -豁 -本 -羹 -执 -条 -钦 -H -獒 -限 -进 -季 -楦 -于 -芘 -玖 -铋 -茯 -未 -答 -粘 -括 -样 -精 -欠 -矢 -甥 -帷 -嵩 -扣 -令 -仔 -风 -皈 -行 -支 -部 -蓉 -刮 -站 -蜡 -救 -钊 -汗 -松 -嫌 -成 -可 -. 
-鹤 -院 -从 -交 -政 -怕 -活 -调 -球 -局 -验 -髌 -第 -韫 -谗 -串 -到 -圆 -年 -米 -/ -* -友 -忿 -检 -区 -看 -自 -敢 -刃 -个 -兹 -弄 -流 -留 -同 -没 -齿 -星 -聆 -轼 -湖 -什 -三 -建 -蛔 -儿 -椋 -汕 -震 -颧 -鲤 -跟 -力 -情 -璺 -铨 -陪 -务 -指 -族 -训 -滦 -鄣 -濮 -扒 -商 -箱 -十 -召 -慷 -辗 -所 -莞 -管 -护 -臭 -横 -硒 -嗓 -接 -侦 -六 -露 -党 -馋 -驾 -剖 -高 -侬 -妪 -幂 -猗 -绺 -骐 -央 -酐 -孝 -筝 -课 -徇 -缰 -门 -男 -西 -项 -句 -谙 -瞒 -秃 -篇 -教 -碲 -罚 -声 -呐 -景 -前 -富 -嘴 -鳌 -稀 -免 -朋 -啬 -睐 -去 -赈 -鱼 -住 -肩 -愕 -速 -旁 -波 -厅 -健 -茼 -厥 -鲟 -谅 -投 -攸 -炔 -数 -方 -击 -呋 -谈 -绩 -别 -愫 -僚 -躬 -鹧 -胪 -炳 -招 -喇 -膨 -泵 -蹦 -毛 -结 -5 -4 -谱 -识 -陕 -粽 -婚 -拟 -构 -且 -搜 -任 -潘 -比 -郢 -妨 -醪 -陀 -桔 -碘 -扎 -选 -哈 -骷 -楷 -亿 -明 -缆 -脯 -监 -睫 -逻 -婵 -共 -赴 -淝 -凡 -惦 -及 -达 -揖 -谩 -澹 -减 -焰 -蛹 -番 -祁 -柏 -员 -禄 -怡 -峤 -龙 -白 -叽 -生 -闯 -起 -细 -装 -谕 -竟 -聚 -钙 -上 -导 -渊 -按 -艾 -辘 -挡 -耒 -盹 -饪 -臀 -记 -邮 -蕙 -受 -各 -医 -搂 -普 -滇 -朗 -茸 -带 -翻 -酚 -( -光 -堤 -墟 -蔷 -万 -幻 -〓 -瑙 -辈 -昧 -盏 -亘 -蛀 -吉 -铰 -请 -子 -假 -闻 -税 -井 -诩 -哨 -嫂 -好 -面 -琐 -校 -馊 -鬣 -缂 -营 -访 -炖 -占 -农 -缀 -否 -经 -钚 -棵 -趟 -张 -亟 -吏 -茶 -谨 -捻 -论 -迸 -堂 -玉 -信 -吧 -瞠 -乡 -姬 -寺 -咬 -溏 -苄 -皿 -意 -赉 -宝 -尔 -钰 -艺 -特 -唳 -踉 -都 -荣 -倚 -登 -荐 -丧 -奇 -涵 -批 -炭 -近 -符 -傩 -感 -道 -着 -菊 -虹 -仲 -众 -懈 -濯 -颞 -眺 -南 -释 -北 -缝 -标 -既 -茗 -整 -撼 -迤 -贲 -挎 -耱 -拒 -某 -妍 -卫 -哇 -英 -矶 -藩 -治 -他 -元 -领 -膜 -遮 -穗 -蛾 -飞 -荒 -棺 -劫 -么 -市 -火 -温 -拈 -棚 -洼 -转 -果 -奕 -卸 -迪 -伸 -泳 -斗 -邡 -侄 -涨 -屯 -萋 -胭 -氡 -崮 -枞 -惧 -冒 -彩 -斜 -手 -豚 -随 -旭 -淑 -妞 -形 -菌 -吲 -沱 -争 -驯 -歹 -挟 -兆 -柱 -传 -至 -包 -内 -响 -临 -红 -功 -弩 -衡 -寂 -禁 -老 -棍 -耆 -渍 -织 -害 -氵 -渑 -布 -载 -靥 -嗬 -虽 -苹 -咨 -娄 -库 -雉 -榜 -帜 -嘲 -套 -瑚 -亲 -簸 -欧 -边 -6 -腿 -旮 -抛 -吹 -瞳 -得 -镓 -梗 -厨 -继 -漾 -愣 -憨 -士 -策 -窑 -抑 -躯 -襟 -脏 -参 -贸 -言 -干 -绸 -鳄 -穷 -藜 -音 -折 -详 -) -举 -悍 -甸 -癌 -黎 -谴 -死 -罩 -迁 -寒 -驷 -袖 -媒 -蒋 -掘 -模 -纠 -恣 -观 -祖 -蛆 -碍 -位 -稿 -主 -澧 -跌 -筏 -京 -锏 -帝 -贴 -证 -糠 -才 -黄 -鲸 -略 -炯 -饱 -四 -出 -园 -犀 -牧 -容 -汉 -杆 -浈 -汰 -瑷 -造 -虫 -瘩 -怪 -驴 -济 -应 -花 -沣 -谔 -夙 -旅 -价 -矿 -以 -考 -s -u -呦 -晒 -巡 -茅 -准 -肟 -瓴 -詹 -仟 -褂 -译 -桌 -混 -宁 -怦 -郑 -抿 -些 -余 -鄂 -饴 -攒 -珑 -群 -阖 -岔 -琨 -藓 -预 -环 -洮 -岌 -宀 -杲 -瀵 -最 -常 -囡 -周 -踊 -女 -鼓 -袭 -喉 -简 -范 -薯 -遐 -疏 -粱 -黜 -禧 -法 -箔 -斤 -遥 -汝 -奥 -直 -贞 -撑 -置 -绱 -集 -她 -馅 -逗 -钧 -橱 -魉 -[ -恙 -躁 -唤 -9 -旺 -膘 -待 -脾 -惫 -购 -吗 -依 -盲 -度 -瘿 -蠖 -俾 -之 -镗 -拇 -鲵 -厝 -簧 -续 -款 -展 -啃 -表 -剔 -品 -钻 -腭 -损 -清 -锶 -统 -涌 -寸 -滨 -贪 -链 -吠 -冈 -伎 -迥 -咏 -吁 -览 -防 -迅 -失 -汾 -阔 -逵 -绀 -蔑 -列 -川 -凭 -努 -熨 -揪 -利 -俱 -绉 -抢 -鸨 -我 -即 -责 -膦 -易 -毓 -鹊 -刹 -玷 -岿 -空 -嘞 -绊 -排 -术 -估 -锷 -违 -们 -苟 -铜 -播 -肘 -件 -烫 -审 -鲂 -广 -像 -铌 -惰 -铟 -巳 -胍 -鲍 -康 -憧 -色 -恢 -想 -拷 -尤 -疳 -知 -S -Y -F -D -A -峄 -裕 -帮 -握 -搔 -氐 -氘 -难 -墒 -沮 -雨 -叁 -缥 -悴 -藐 -湫 -娟 -苑 -稠 -颛 -簇 -后 -阕 -闭 -蕤 -缚 -怎 -佞 -码 -嘤 -蔡 -痊 -舱 -螯 -帕 -赫 -昵 -升 -烬 -岫 -、 -疵 -蜻 -髁 -蕨 -隶 -烛 -械 -丑 -盂 -梁 -强 -鲛 -由 -拘 -揉 -劭 -龟 -撤 -钩 -呕 -孛 -费 -妻 -漂 -求 -阑 -崖 -秤 -甘 -通 -深 -补 -赃 -坎 -床 -啪 -承 -吼 -量 -暇 -钼 -烨 -阂 -擎 -脱 -逮 -称 -P -神 -属 -矗 -华 -届 -狍 -葑 -汹 -育 -患 -窒 -蛰 -佼 -静 -槎 -运 -鳗 -庆 -逝 -曼 -疱 -克 -代 -官 -此 -麸 -耧 -蚌 -晟 -例 -础 -榛 -副 -测 -唰 -缢 -迹 -灬 -霁 -身 -岁 -赭 -扛 -又 -菡 -乜 -雾 -板 -读 -陷 -徉 -贯 -郁 -虑 -变 -钓 -菜 -圾 -现 -琢 -式 -乐 -维 -渔 -浜 -左 -吾 -脑 -钡 -警 -T -啵 -拴 -偌 -漱 -湿 -硕 -止 -骼 -魄 -积 -燥 -联 -踢 -玛 -则 -窿 -见 -振 -畿 -送 -班 -钽 -您 -赵 -刨 -印 -讨 -踝 -籍 -谡 -舌 -崧 -汽 -蔽 -沪 -酥 -绒 -怖 -财 -帖 -肱 -私 -莎 -勋 -羔 -霸 -励 -哼 -帐 -将 -帅 -渠 -纪 -婴 -娩 -岭 -厘 -滕 -吻 -伤 -坝 -冠 -戊 -隆 -瘁 -介 -涧 -物 -黍 -并 -姗 -奢 -蹑 -掣 -垸 -锴 -命 -箍 -捉 -病 -辖 -琰 -眭 -迩 -艘 -绌 -繁 -寅 -若 -毋 -思 -诉 -类 -诈 -燮 -轲 -酮 -狂 -重 -反 -职 -筱 -县 -委 -磕 -绣 -奖 -晋 -濉 -志 -徽 -肠 -呈 -獐 -坻 -口 -片 -碰 -几 -村 -柿 -劳 -料 -获 -亩 -惕 -晕 -厌 -号 -罢 -池 -正 -鏖 -煨 -家 -棕 -复 -尝 -懋 -蜥 -锅 -岛 -扰 -队 -坠 -瘾 -钬 -@ -卧 -疣 -镇 -譬 -冰 -彷 -频 -黯 -据 -垄 -采 -八 -缪 -瘫 -型 -熹 -砰 -楠 -襁 -箐 -但 -嘶 -绳 -啤 -拍 -盥 -穆 -傲 -洗 -盯 -塘 -怔 -筛 -丿 -台 -恒 -喂 -葛 -永 -¥ -烟 -酒 -桦 -书 -砂 -蚝 -缉 -态 -瀚 -袄 -圳 -轻 -蛛 -超 -榧 -遛 -姒 -奘 -铮 -右 -荽 -望 -偻 -卡 -丶 -氰 -附 -做 -革 -索 -戚 -坨 -桷 -唁 -垅 -榻 -岐 -偎 -坛 -莨 -山 -殊 -微 -骇 -陈 -爨 -推 -嗝 -驹 -澡 -藁 -呤 -卤 -嘻 -糅 -逛 -侵 -郓 -酌 -德 -摇 -※ -鬃 -被 -慨 -殡 -羸 -昌 -泡 -戛 -鞋 -河 -宪 -沿 -玲 -鲨 -翅 -哽 -源 -铅 -语 -照 
-邯 -址 -荃 -佬 -顺 -鸳 -町 -霭 -睾 -瓢 -夸 -椁 -晓 -酿 -痈 -咔 -侏 -券 -噎 -湍 -签 -嚷 -离 -午 -尚 -社 -锤 -背 -孟 -使 -浪 -缦 -潍 -鞅 -军 -姹 -驶 -笑 -鳟 -鲁 -》 -孽 -钜 -绿 -洱 -礴 -焯 -椰 -颖 -囔 -乌 -孔 -巴 -互 -性 -椽 -哞 -聘 -昨 -早 -暮 -胶 -炀 -隧 -低 -彗 -昝 -铁 -呓 -氽 -藉 -喔 -癖 -瑗 -姨 -权 -胱 -韦 -堑 -蜜 -酋 -楝 -砝 -毁 -靓 -歙 -锲 -究 -屋 -喳 -骨 -辨 -碑 -武 -鸠 -宫 -辜 -烊 -适 -坡 -殃 -培 -佩 -供 -走 -蜈 -迟 -翼 -况 -姣 -凛 -浔 -吃 -飘 -债 -犟 -金 -促 -苛 -崇 -坂 -莳 -畔 -绂 -兵 -蠕 -斋 -根 -砍 -亢 -欢 -恬 -崔 -剁 -餐 -榫 -快 -扶 -‖ -濒 -缠 -鳜 -当 -彭 -驭 -浦 -篮 -昀 -锆 -秸 -钳 -弋 -娣 -瞑 -夷 -龛 -苫 -拱 -致 -% -嵊 -障 -隐 -弑 -初 -娓 -抉 -汩 -累 -蓖 -" -唬 -助 -苓 -昙 -押 -毙 -破 -城 -郧 -逢 -嚏 -獭 -瞻 -溱 -婿 -赊 -跨 -恼 -璧 -萃 -姻 -貉 -灵 -炉 -密 -氛 -陶 -砸 -谬 -衔 -点 -琛 -沛 -枳 -层 -岱 -诺 -脍 -榈 -埂 -征 -冷 -裁 -打 -蹴 -素 -瘘 -逞 -蛐 -聊 -激 -腱 -萘 -踵 -飒 -蓟 -吆 -取 -咙 -簋 -涓 -矩 -曝 -挺 -揣 -座 -你 -史 -舵 -焱 -尘 -苏 -笈 -脚 -溉 -榨 -诵 -樊 -邓 -焊 -义 -庶 -儋 -蟋 -蒲 -赦 -呷 -杞 -诠 -豪 -还 -试 -颓 -茉 -太 -除 -紫 -逃 -痴 -草 -充 -鳕 -珉 -祗 -墨 -渭 -烩 -蘸 -慕 -璇 -镶 -穴 -嵘 -恶 -骂 -险 -绋 -幕 -碉 -肺 -戳 -刘 -潞 -秣 -纾 -潜 -銮 -洛 -须 -罘 -销 -瘪 -汞 -兮 -屉 -r -林 -厕 -质 -探 -划 -狸 -殚 -善 -煊 -烹 -〒 -锈 -逯 -宸 -辍 -泱 -柚 -袍 -远 -蹋 -嶙 -绝 -峥 -娥 -缍 -雀 -徵 -认 -镱 -谷 -= -贩 -勉 -撩 -鄯 -斐 -洋 -非 -祚 -泾 -诒 -饿 -撬 -威 -晷 -搭 -芍 -锥 -笺 -蓦 -候 -琊 -档 -礁 -沼 -卵 -荠 -忑 -朝 -凹 -瑞 -头 -仪 -弧 -孵 -畏 -铆 -突 -衲 -车 -浩 -气 -茂 -悖 -厢 -枕 -酝 -戴 -湾 -邹 -飚 -攘 -锂 -写 -宵 -翁 -岷 -无 -喜 -丈 -挑 -嗟 -绛 -殉 -议 -槽 -具 -醇 -淞 -笃 -郴 -阅 -饼 -底 -壕 -砚 -弈 -询 -缕 -庹 -翟 -零 -筷 -暨 -舟 -闺 -甯 -撞 -麂 -茌 -蔼 -很 -珲 -捕 -棠 -角 -阉 -媛 -娲 -诽 -剿 -尉 -爵 -睬 -韩 -诰 -匣 -危 -糍 -镯 -立 -浏 -阳 -少 -盆 -舔 -擘 -匪 -申 -尬 -铣 -旯 -抖 -赘 -瓯 -居 -ˇ -哮 -游 -锭 -茏 -歌 -坏 -甚 -秒 -舞 -沙 -仗 -劲 -潺 -阿 -燧 -郭 -嗖 -霏 -忠 -材 -奂 -耐 -跺 -砀 -输 -岖 -媳 -氟 -极 -摆 -灿 -今 -扔 -腻 -枝 -奎 -药 -熄 -吨 -话 -q -额 -慑 -嘌 -协 -喀 -壳 -埭 -视 -著 -於 -愧 -陲 -翌 -峁 -颅 -佛 -腹 -聋 -侯 -咎 -叟 -秀 -颇 -存 -较 -罪 -哄 -岗 -扫 -栏 -钾 -羌 -己 -璨 -枭 -霉 -煌 -涸 -衿 -键 -镝 -益 -岢 -奏 -连 -夯 -睿 -冥 -均 -糖 -狞 -蹊 -稻 -爸 -刿 -胥 -煜 -丽 -肿 -璃 -掸 -跚 -灾 -垂 -樾 -濑 -乎 -莲 -窄 -犹 -撮 -战 -馄 -软 -络 -显 -鸢 -胸 -宾 -妲 -恕 -埔 -蝌 -份 -遇 -巧 -瞟 -粒 -恰 -剥 -桡 -博 -讯 -凯 -堇 -阶 -滤 -卖 -斌 -骚 -彬 -兑 -磺 -樱 -舷 -两 -娱 -福 -仃 -差 -找 -桁 -÷ -净 -把 -阴 -污 -戬 -雷 -碓 -蕲 -楚 -罡 -焖 -抽 -妫 -咒 -仑 -闱 -尽 -邑 -菁 -爱 -贷 -沥 -鞑 -牡 -嗉 -崴 -骤 -塌 -嗦 -订 -拮 -滓 -捡 -锻 -次 -坪 -杩 -臃 -箬 -融 -珂 -鹗 -宗 -枚 -降 -鸬 -妯 -阄 -堰 -盐 -毅 -必 -杨 -崃 -俺 -甬 -状 -莘 -货 -耸 -菱 -腼 -铸 -唏 -痤 -孚 -澳 -懒 -溅 -翘 -疙 -杷 -淼 -缙 -骰 -喊 -悉 -砻 -坷 -艇 -赁 -界 -谤 -纣 -宴 -晃 -茹 -归 -饭 -梢 -铡 -街 -抄 -肼 -鬟 -苯 -颂 -撷 -戈 -炒 -咆 -茭 -瘙 -负 -仰 -客 -琉 -铢 -封 -卑 -珥 -椿 -镧 -窨 -鬲 -寿 -御 -袤 -铃 -萎 -砖 -餮 -脒 -裳 -肪 -孕 -嫣 -馗 -嵇 -恳 -氯 -江 -石 -褶 -冢 -祸 -阻 -狈 -羞 -银 -靳 -透 -咳 -叼 -敷 -芷 -啥 -它 -瓤 -兰 -痘 -懊 -逑 -肌 -往 -捺 -坊 -甩 -呻 -〃 -沦 -忘 -膻 -祟 -菅 -剧 -崆 -智 -坯 -臧 -霍 -墅 -攻 -眯 -倘 -拢 -骠 -铐 -庭 -岙 -瓠 -′ -缺 -泥 -迢 -捶 -? -? 
-郏 -喙 -掷 -沌 -纯 -秘 -种 -听 -绘 -固 -螨 -团 -香 -盗 -妒 -埚 -蓝 -拖 -旱 -荞 -铀 -血 -遏 -汲 -辰 -叩 -拽 -幅 -硬 -惶 -桀 -漠 -措 -泼 -唑 -齐 -肾 -念 -酱 -虚 -屁 -耶 -旗 -砦 -闵 -婉 -馆 -拭 -绅 -韧 -忏 -窝 -醋 -葺 -顾 -辞 -倜 -堆 -辋 -逆 -玟 -贱 -疾 -董 -惘 -倌 -锕 -淘 -嘀 -莽 -俭 -笏 -绑 -鲷 -杈 -择 -蟀 -粥 -嗯 -驰 -逾 -案 -谪 -褓 -胫 -哩 -昕 -颚 -鲢 -绠 -躺 -鹄 -崂 -儒 -俨 -丝 -尕 -泌 -啊 -萸 -彰 -幺 -吟 -骄 -苣 -弦 -脊 -瑰 -〈 -诛 -镁 -析 -闪 -剪 -侧 -哟 -框 -螃 -守 -嬗 -燕 -狭 -铈 -缮 -概 -迳 -痧 -鲲 -俯 -售 -笼 -痣 -扉 -挖 -满 -咋 -援 -邱 -扇 -歪 -便 -玑 -绦 -峡 -蛇 -叨 -〖 -泽 -胃 -斓 -喋 -怂 -坟 -猪 -该 -蚬 -炕 -弥 -赞 -棣 -晔 -娠 -挲 -狡 -创 -疖 -铕 -镭 -稷 -挫 -弭 -啾 -翔 -粉 -履 -苘 -哦 -楼 -秕 -铂 -土 -锣 -瘟 -挣 -栉 -习 -享 -桢 -袅 -磨 -桂 -谦 -延 -坚 -蔚 -噗 -署 -谟 -猬 -钎 -恐 -嬉 -雒 -倦 -衅 -亏 -璩 -睹 -刻 -殿 -王 -算 -雕 -麻 -丘 -柯 -骆 -丸 -塍 -谚 -添 -鲈 -垓 -桎 -蚯 -芥 -予 -飕 -镦 -谌 -窗 -醚 -菀 -亮 -搪 -莺 -蒿 -羁 -足 -J -真 -轶 -悬 -衷 -靛 -翊 -掩 -哒 -炅 -掐 -冼 -妮 -l -谐 -稚 -荆 -擒 -犯 -陵 -虏 -浓 -崽 -刍 -陌 -傻 -孜 -千 -靖 -演 -矜 -钕 -煽 -杰 -酗 -渗 -伞 -栋 -俗 -泫 -戍 -罕 -沾 -疽 -灏 -煦 -芬 -磴 -叱 -阱 -榉 -湃 -蜀 -叉 -醒 -彪 -租 -郡 -篷 -屎 -良 -垢 -隗 -弱 -陨 -峪 -砷 -掴 -颁 -胎 -雯 -绵 -贬 -沐 -撵 -隘 -篙 -暖 -曹 -陡 -栓 -填 -臼 -彦 -瓶 -琪 -潼 -哪 -鸡 -摩 -啦 -俟 -锋 -域 -耻 -蔫 -疯 -纹 -撇 -毒 -绶 -痛 -酯 -忍 -爪 -赳 -歆 -嘹 -辕 -烈 -册 -朴 -钱 -吮 -毯 -癜 -娃 -谀 -邵 -厮 -炽 -璞 -邃 -丐 -追 -词 -瓒 -忆 -轧 -芫 -谯 -喷 -弟 -半 -冕 -裙 -掖 -墉 -绮 -寝 -苔 -势 -顷 -褥 -切 -衮 -君 -佳 -嫒 -蚩 -霞 -佚 -洙 -逊 -镖 -暹 -唛 -& -殒 -顶 -碗 -獗 -轭 -铺 -蛊 -废 -恹 -汨 -崩 -珍 -那 -杵 -曲 -纺 -夏 -薰 -傀 -闳 -淬 -姘 -舀 -拧 -卷 -楂 -恍 -讪 -厩 -寮 -篪 -赓 -乘 -灭 -盅 -鞣 -沟 -慎 -挂 -饺 -鼾 -杳 -树 -缨 -丛 -絮 -娌 -臻 -嗳 -篡 -侩 -述 -衰 -矛 -圈 -蚜 -匕 -筹 -匿 -濞 -晨 -叶 -骋 -郝 -挚 -蚴 -滞 -增 -侍 -描 -瓣 -吖 -嫦 -蟒 -匾 -圣 -赌 -毡 -癞 -恺 -百 -曳 -需 -篓 -肮 -庖 -帏 -卿 -驿 -遗 -蹬 -鬓 -骡 -歉 -芎 -胳 -屐 -禽 -烦 -晌 -寄 -媾 -狄 -翡 -苒 -船 -廉 -终 -痞 -殇 -々 -畦 -饶 -改 -拆 -悻 -萄 -£ -瓿 -乃 -訾 -桅 -匮 -溧 -拥 -纱 -铍 -骗 -蕃 -龋 -缬 -父 -佐 -疚 -栎 -醍 -掳 -蓄 -x -惆 -颜 -鲆 -榆 -〔 -猎 -敌 -暴 -谥 -鲫 -贾 -罗 -玻 -缄 -扦 -芪 -癣 -落 -徒 -臾 -恿 -猩 -托 -邴 -肄 -牵 -春 -陛 -耀 -刊 -拓 -蓓 -邳 -堕 -寇 -枉 -淌 -啡 -湄 -兽 -酷 -萼 -碚 -濠 -萤 -夹 -旬 -戮 -梭 -琥 -椭 -昔 -勺 -蜊 -绐 -晚 -孺 -僵 -宣 -摄 -冽 -旨 -萌 -忙 -蚤 -眉 -噼 -蟑 -付 -契 -瓜 -悼 -颡 -壁 -曾 -窕 -颢 -澎 -仿 -俑 -浑 -嵌 -浣 -乍 -碌 -褪 -乱 -蔟 -隙 -玩 -剐 -葫 -箫 -纲 -围 -伐 -决 -伙 -漩 -瑟 -刑 -肓 -镳 -缓 -蹭 -氨 -皓 -典 -畲 -坍 -铑 -檐 -塑 -洞 -倬 -储 -胴 -淳 -戾 -吐 -灼 -惺 -妙 -毕 -珐 -缈 -虱 -盖 -羰 -鸿 -磅 -谓 -髅 -娴 -苴 -唷 -蚣 -霹 -抨 -贤 -唠 -犬 -誓 -逍 -庠 -逼 -麓 -籼 -釉 -呜 -碧 -秧 -氩 -摔 -霄 -穸 -纨 -辟 -妈 -映 -完 -牛 -缴 -嗷 -炊 -恩 -荔 -茆 -掉 -紊 -慌 -莓 -羟 -阙 -萁 -磐 -另 -蕹 -辱 -鳐 -湮 -吡 -吩 -唐 -睦 -垠 -舒 -圜 -冗 -瞿 -溺 -芾 -囱 -匠 -僳 -汐 -菩 -饬 -漓 -黑 -霰 -浸 -濡 -窥 -毂 -蒡 -兢 -驻 -鹉 -芮 -诙 -迫 -雳 -厂 -忐 -臆 -猴 -鸣 -蚪 -栈 -箕 -羡 -渐 -莆 -捍 -眈 -哓 -趴 -蹼 -埕 -嚣 -骛 -宏 -淄 -斑 -噜 -严 -瑛 -垃 -椎 -诱 -压 -庾 -绞 -焘 -廿 -抡 -迄 -棘 -夫 -纬 -锹 -眨 -瞌 -侠 -脐 -竞 -瀑 -孳 -骧 -遁 -姜 -颦 -荪 -滚 -萦 -伪 -逸 -粳 -爬 -锁 -矣 -役 -趣 -洒 -颔 -诏 -逐 -奸 -甭 -惠 -攀 -蹄 -泛 -尼 -拼 -阮 -鹰 -亚 -颈 -惑 -勒 -〉 -际 -肛 -爷 -刚 -钨 -丰 -养 -冶 -鲽 -辉 -蔻 -画 -覆 -皴 -妊 -麦 -返 -醉 -皂 -擀 -〗 -酶 -凑 -粹 -悟 -诀 -硖 -港 -卜 -z -杀 -涕 -± -舍 -铠 -抵 -弛 -段 -敝 -镐 -奠 -拂 -轴 -跛 -袱 -e -t -沉 -菇 -俎 -薪 -峦 -秭 -蟹 -历 -盟 -菠 -寡 -液 -肢 -喻 -染 -裱 -悱 -抱 -氙 -赤 -捅 -猛 -跑 -氮 -谣 -仁 -尺 -辊 -窍 -烙 -衍 -架 -擦 -倏 -璐 -瑁 -币 -楞 -胖 -夔 -趸 -邛 -惴 -饕 -虔 -蝎 -§ -哉 -贝 -宽 -辫 -炮 -扩 -饲 -籽 -魏 -菟 -锰 -伍 -猝 -末 -琳 -哚 -蛎 -邂 -呀 -姿 -鄞 -却 -歧 -仙 -恸 -椐 -森 -牒 -寤 -袒 -婆 -虢 -雅 -钉 -朵 -贼 -欲 -苞 -寰 -故 -龚 -坭 -嘘 -咫 -礼 -硷 -兀 -睢 -汶 -’ -铲 -烧 -绕 -诃 -浃 -钿 -哺 -柜 -讼 -颊 -璁 -腔 -洽 -咐 -脲 -簌 -筠 -镣 -玮 -鞠 -谁 -兼 -姆 -挥 -梯 -蝴 -谘 -漕 -刷 -躏 -宦 -弼 -b -垌 -劈 -麟 -莉 -揭 -笙 -渎 -仕 -嗤 -仓 -配 -怏 -抬 -错 -泯 -镊 -孰 -猿 -邪 -仍 -秋 -鼬 -壹 -歇 -吵 -炼 -< -尧 -射 -柬 -廷 -胧 -霾 -凳 -隋 -肚 -浮 -梦 -祥 -株 -堵 -退 -L -鹫 -跎 -凶 -毽 -荟 -炫 -栩 -玳 -甜 -沂 -鹿 -顽 -伯 -爹 -赔 -蛴 -徐 -匡 -欣 -狰 -缸 -雹 -蟆 -疤 -默 -沤 -啜 -痂 -衣 -禅 -w -i -h -辽 -葳 -黝 -钗 -停 -沽 -棒 -馨 -颌 -肉 -吴 -硫 -悯 -劾 -娈 -马 -啧 -吊 -悌 -镑 -峭 -帆 -瀣 -涉 -咸 -疸 -滋 -泣 -翦 -拙 -癸 -钥 -蜒 -+ -尾 -庄 -凝 -泉 -婢 -渴 -谊 -乞 -陆 -锉 -糊 -鸦 -淮 -I -B -N -晦 -弗 -乔 -庥 -葡 -尻 -席 -橡 -傣 -渣 -拿 -惩 -麋 -斛 -缃 -矮 -蛏 -岘 -鸽 -姐 -膏 -催 -奔 -镒 -喱 -蠡 -摧 -钯 -胤 -柠 -拐 -璋 -鸥 -卢 -荡 -倾 -^ -_ -珀 -逄 -萧 -塾 -掇 -贮 -笆 -聂 -圃 -冲 -嵬 -M -滔 -笕 -值 
-炙 -偶 -蜱 -搐 -梆 -汪 -蔬 -腑 -鸯 -蹇 -敞 -绯 -仨 -祯 -谆 -梧 -糗 -鑫 -啸 -豺 -囹 -猾 -巢 -柄 -瀛 -筑 -踌 -沭 -暗 -苁 -鱿 -蹉 -脂 -蘖 -牢 -热 -木 -吸 -溃 -宠 -序 -泞 -偿 -拜 -檩 -厚 -朐 -毗 -螳 -吞 -媚 -朽 -担 -蝗 -橘 -畴 -祈 -糟 -盱 -隼 -郜 -惜 -珠 -裨 -铵 -焙 -琚 -唯 -咚 -噪 -骊 -丫 -滢 -勤 -棉 -呸 -咣 -淀 -隔 -蕾 -窈 -饨 -挨 -煅 -短 -匙 -粕 -镜 -赣 -撕 -墩 -酬 -馁 -豌 -颐 -抗 -酣 -氓 -佑 -搁 -哭 -递 -耷 -涡 -桃 -贻 -碣 -截 -瘦 -昭 -镌 -蔓 -氚 -甲 -猕 -蕴 -蓬 -散 -拾 -纛 -狼 -猷 -铎 -埋 -旖 -矾 -讳 -囊 -糜 -迈 -粟 -蚂 -紧 -鲳 -瘢 -栽 -稼 -羊 -锄 -斟 -睁 -桥 -瓮 -蹙 -祉 -醺 -鼻 -昱 -剃 -跳 -篱 -跷 -蒜 -翎 -宅 -晖 -嗑 -壑 -峻 -癫 -屏 -狠 -陋 -袜 -途 -憎 -祀 -莹 -滟 -佶 -溥 -臣 -约 -盛 -峰 -磁 -慵 -婪 -拦 -莅 -朕 -鹦 -粲 -裤 -哎 -疡 -嫖 -琵 -窟 -堪 -谛 -嘉 -儡 -鳝 -斩 -郾 -驸 -酊 -妄 -胜 -贺 -徙 -傅 -噌 -钢 -栅 -庇 -恋 -匝 -巯 -邈 -尸 -锚 -粗 -佟 -蛟 -薹 -纵 -蚊 -郅 -绢 -锐 -苗 -俞 -篆 -淆 -膀 -鲜 -煎 -诶 -秽 -寻 -涮 -刺 -怀 -噶 -巨 -褰 -魅 -灶 -灌 -桉 -藕 -谜 -舸 -薄 -搀 -恽 -借 -牯 -痉 -渥 -愿 -亓 -耘 -杠 -柩 -锔 -蚶 -钣 -珈 -喘 -蹒 -幽 -赐 -稗 -晤 -莱 -泔 -扯 -肯 -菪 -裆 -腩 -豉 -疆 -骜 -腐 -倭 -珏 -唔 -粮 -亡 -润 -慰 -伽 -橄 -玄 -誉 -醐 -胆 -龊 -粼 -塬 -陇 -彼 -削 -嗣 -绾 -芽 -妗 -垭 -瘴 -爽 -薏 -寨 -龈 -泠 -弹 -赢 -漪 -猫 -嘧 -涂 -恤 -圭 -茧 -烽 -屑 -痕 -巾 -赖 -荸 -凰 -腮 -畈 -亵 -蹲 -偃 -苇 -澜 -艮 -换 -骺 -烘 -苕 -梓 -颉 -肇 -哗 -悄 -氤 -涠 -葬 -屠 -鹭 -植 -竺 -佯 -诣 -鲇 -瘀 -鲅 -邦 -移 -滁 -冯 -耕 -癔 -戌 -茬 -沁 -巩 -悠 -湘 -洪 -痹 -锟 -循 -谋 -腕 -鳃 -钠 -捞 -焉 -迎 -碱 -伫 -急 -榷 -奈 -邝 -卯 -辄 -皲 -卟 -醛 -畹 -忧 -稳 -雄 -昼 -缩 -阈 -睑 -扌 -耗 -曦 -涅 -捏 -瞧 -邕 -淖 -漉 -铝 -耦 -禹 -湛 -喽 -莼 -琅 -诸 -苎 -纂 -硅 -始 -嗨 -傥 -燃 -臂 -赅 -嘈 -呆 -贵 -屹 -壮 -肋 -亍 -蚀 -卅 -豹 -腆 -邬 -迭 -浊 -} -童 -螂 -捐 -圩 -勐 -触 -寞 -汊 -壤 -荫 -膺 -渌 -芳 -懿 -遴 -螈 -泰 -蓼 -蛤 -茜 -舅 -枫 -朔 -膝 -眙 -避 -梅 -判 -鹜 -璜 -牍 -缅 -垫 -藻 -黔 -侥 -惚 -懂 -踩 -腰 -腈 -札 -丞 -唾 -慈 -顿 -摹 -荻 -琬 -~ -斧 -沈 -滂 -胁 -胀 -幄 -莜 -Z -匀 -鄄 -掌 -绰 -茎 -焚 -赋 -萱 -谑 -汁 -铒 -瞎 -夺 -蜗 -野 -娆 -冀 -弯 -篁 -懵 -灞 -隽 -芡 -脘 -俐 -辩 -芯 -掺 -喏 -膈 -蝈 -觐 -悚 -踹 -蔗 -熠 -鼠 -呵 -抓 -橼 -峨 -畜 -缔 -禾 -崭 -弃 -熊 -摒 -凸 -拗 -穹 -蒙 -抒 -祛 -劝 -闫 -扳 -阵 -醌 -踪 -喵 -侣 -搬 -仅 -荧 -赎 -蝾 -琦 -买 -婧 -瞄 -寓 -皎 -冻 -赝 -箩 -莫 -瞰 -郊 -笫 -姝 -筒 -枪 -遣 -煸 -袋 -舆 -痱 -涛 -母 -〇 -启 -践 -耙 -绲 -盘 -遂 -昊 -搞 -槿 -诬 -纰 -泓 -惨 -檬 -亻 -越 -C -o -憩 -熵 -祷 -钒 -暧 -塔 -阗 -胰 -咄 -娶 -魔 -琶 -钞 -邻 -扬 -杉 -殴 -咽 -弓 -〆 -髻 -】 -吭 -揽 -霆 -拄 -殖 -脆 -彻 -岩 -芝 -勃 -辣 -剌 -钝 -嘎 -甄 -佘 -皖 -伦 -授 -徕 -憔 -挪 -皇 -庞 -稔 -芜 -踏 -溴 -兖 -卒 -擢 -饥 -鳞 -煲 -‰ -账 -颗 -叻 -斯 -捧 -鳍 -琮 -讹 -蛙 -纽 -谭 -酸 -兔 -莒 -睇 -伟 -觑 -羲 -嗜 -宜 -褐 -旎 -辛 -卦 -诘 -筋 -鎏 -溪 -挛 -熔 -阜 -晰 -鳅 -丢 -奚 -灸 -呱 -献 -陉 -黛 -鸪 -甾 -萨 -疮 -拯 -洲 -疹 -辑 -叙 -恻 -谒 -允 -柔 -烂 -氏 -逅 -漆 -拎 -惋 -扈 -湟 -纭 -啕 -掬 -擞 -哥 -忽 -涤 -鸵 -靡 -郗 -瓷 -扁 -廊 -怨 -雏 -钮 -敦 -E -懦 -憋 -汀 -拚 -啉 -腌 -岸 -f -痼 -瞅 -尊 -咀 -眩 -飙 -忌 -仝 -迦 -熬 -毫 -胯 -篑 -茄 -腺 -凄 -舛 -碴 -锵 -诧 -羯 -後 -漏 -汤 -宓 -仞 -蚁 -壶 -谰 -皑 -铄 -棰 -罔 -辅 -晶 -苦 -牟 -闽 -\ -烃 -饮 -聿 -丙 -蛳 -朱 -煤 -涔 -鳖 -犁 -罐 -荼 -砒 -淦 -妤 -黏 -戎 -孑 -婕 -瑾 -戢 -钵 -枣 -捋 -砥 -衩 -狙 -桠 -稣 -阎 -肃 -梏 -诫 -孪 -昶 -婊 -衫 -嗔 -侃 -塞 -蜃 -樵 -峒 -貌 -屿 -欺 -缫 -阐 -栖 -诟 -珞 -荭 -吝 -萍 -嗽 -恂 -啻 -蜴 -磬 -峋 -俸 -豫 -谎 -徊 -镍 -韬 -魇 -晴 -U -囟 -猜 -蛮 -坐 -囿 -伴 -亭 -肝 -佗 -蝠 -妃 -胞 -滩 -榴 -氖 -垩 -苋 -砣 -扪 -馏 -姓 -轩 -厉 -夥 -侈 -禀 -垒 -岑 -赏 -钛 -辐 -痔 -披 -纸 -碳 -“ -坞 -蠓 -挤 -荥 -沅 -悔 -铧 -帼 -蒌 -蝇 -a -p -y -n -g -哀 -浆 -瑶 -凿 -桶 -馈 -皮 -奴 -苜 -佤 -伶 -晗 -铱 -炬 -优 -弊 -氢 -恃 -甫 -攥 -端 -锌 -灰 -稹 -炝 -曙 -邋 -亥 -眶 -碾 -拉 -萝 -绔 -捷 -浍 -腋 -姑 -菖 -凌 -涞 -麽 -锢 -桨 -潢 -绎 -镰 -殆 -锑 -渝 -铬 -困 -绽 -觎 -匈 -糙 -暑 -裹 -鸟 -盔 -肽 -迷 -綦 -『 -亳 -佝 -俘 -钴 -觇 -骥 -仆 -疝 -跪 -婶 -郯 -瀹 -唉 -脖 -踞 -针 -晾 -忒 -扼 -瞩 -叛 -椒 -疟 -嗡 -邗 -肆 -跆 -玫 -忡 -捣 -咧 -唆 -艄 -蘑 -潦 -笛 -阚 -沸 -泻 -掊 -菽 -贫 -斥 -髂 -孢 -镂 -赂 -麝 -鸾 -屡 -衬 -苷 -恪 -叠 -希 -粤 -爻 -喝 -茫 -惬 -郸 -绻 -庸 -撅 -碟 -宄 -妹 -膛 -叮 -饵 -崛 -嗲 -椅 -冤 -搅 -咕 -敛 -尹 -垦 -闷 -蝉 -霎 -勰 -败 -蓑 -泸 -肤 -鹌 -幌 -焦 -浠 -鞍 -刁 -舰 -乙 -竿 -裔 -。 -茵 -函 -伊 -兄 -丨 -娜 -匍 -謇 -莪 -宥 -似 -蝽 -翳 -酪 -翠 -粑 -薇 -祢 -骏 -赠 -叫 -Q -噤 -噻 -竖 -芗 -莠 -潭 -俊 -羿 -耜 -O -郫 -趁 -嗪 -囚 -蹶 -芒 -洁 -笋 -鹑 -敲 -硝 -啶 -堡 -渲 -揩 -』 -携 -宿 -遒 -颍 -扭 -棱 -割 -萜 -蔸 -葵 -琴 -捂 -饰 -衙 -耿 -掠 -募 -岂 -窖 -涟 -蔺 -瘤 -柞 -瞪 -怜 -匹 -距 -楔 -炜 -哆 -秦 -缎 -幼 -茁 -绪 -痨 -恨 -楸 -娅 -瓦 -桩 -雪 -嬴 -伏 -榔 -妥 -铿 -拌 -眠 -雍 -缇 -‘ -卓 -搓 -哌 -觞 -噩 -屈 -哧 -髓 -咦 -巅 -娑 -侑 -淫 -膳 -祝 -勾 -姊 -莴 
-胄 -疃 -薛 -蜷 -胛 -巷 -芙 -芋 -熙 -闰 -勿 -窃 -狱 -剩 -钏 -幢 -陟 -铛 -慧 -靴 -耍 -k -浙 -浇 -飨 -惟 -绗 -祜 -澈 -啼 -咪 -磷 -摞 -诅 -郦 -抹 -跃 -壬 -吕 -肖 -琏 -颤 -尴 -剡 -抠 -凋 -赚 -泊 -津 -宕 -殷 -倔 -氲 -漫 -邺 -涎 -怠 -$ -垮 -荬 -遵 -俏 -叹 -噢 -饽 -蜘 -孙 -筵 -疼 -鞭 -羧 -牦 -箭 -潴 -c -眸 -祭 -髯 -啖 -坳 -愁 -芩 -驮 -倡 -巽 -穰 -沃 -胚 -怒 -凤 -槛 -剂 -趵 -嫁 -v -邢 -灯 -鄢 -桐 -睽 -檗 -锯 -槟 -婷 -嵋 -圻 -诗 -蕈 -颠 -遭 -痢 -芸 -怯 -馥 -竭 -锗 -徜 -恭 -遍 -籁 -剑 -嘱 -苡 -龄 -僧 -桑 -潸 -弘 -澶 -楹 -悲 -讫 -愤 -腥 -悸 -谍 -椹 -呢 -桓 -葭 -攫 -阀 -翰 -躲 -敖 -柑 -郎 -笨 -橇 -呃 -魁 -燎 -脓 -葩 -磋 -垛 -玺 -狮 -沓 -砜 -蕊 -锺 -罹 -蕉 -翱 -虐 -闾 -巫 -旦 -茱 -嬷 -枯 -鹏 -贡 -芹 -汛 -矫 -绁 -拣 -禺 -佃 -讣 -舫 -惯 -乳 -趋 -疲 -挽 -岚 -虾 -衾 -蠹 -蹂 -飓 -氦 -铖 -孩 -稞 -瑜 -壅 -掀 -勘 -妓 -畅 -髋 -W -庐 -牲 -蓿 -榕 -练 -垣 -唱 -邸 -菲 -昆 -婺 -穿 -绡 -麒 -蚱 -掂 -愚 -泷 -涪 -漳 -妩 -娉 -榄 -讷 -觅 -旧 -藤 -煮 -呛 -柳 -腓 -叭 -庵 -烷 -阡 -罂 -蜕 -擂 -猖 -咿 -媲 -脉 -【 -沏 -貅 -黠 -熏 -哲 -烁 -坦 -酵 -兜 -× -潇 -撒 -剽 -珩 -圹 -乾 -摸 -樟 -帽 -嗒 -襄 -魂 -轿 -憬 -锡 -〕 -喃 -皆 -咖 -隅 -脸 -残 -泮 -袂 -鹂 -珊 -囤 -捆 -咤 -误 -徨 -闹 -淙 -芊 -淋 -怆 -囗 -拨 -梳 -渤 -R -G -绨 -蚓 -婀 -幡 -狩 -麾 -谢 -唢 -裸 -旌 -伉 -纶 -裂 -驳 -砼 -咛 -澄 -樨 -蹈 -宙 -澍 -倍 -貔 -操 -勇 -蟠 -摈 -砧 -虬 -够 -缁 -悦 -藿 -撸 -艹 -摁 -淹 -豇 -虎 -榭 -ˉ -吱 -d -° -喧 -荀 -踱 -侮 -奋 -偕 -饷 -犍 -惮 -坑 -璎 -徘 -宛 -妆 -袈 -倩 -窦 -昂 -荏 -乖 -K -怅 -撰 -鳙 -牙 -袁 -酞 -X -痿 -琼 -闸 -雁 -趾 -荚 -虻 -涝 -《 -杏 -韭 -偈 -烤 -绫 -鞘 -卉 -症 -遢 -蓥 -诋 -杭 -荨 -匆 -竣 -簪 -辙 -敕 -虞 -丹 -缭 -咩 -黟 -m -淤 -瑕 -咂 -铉 -硼 -茨 -嶂 -痒 -畸 -敬 -涿 -粪 -窘 -熟 -叔 -嫔 -盾 -忱 -裘 -憾 -梵 -赡 -珙 -咯 -娘 -庙 -溯 -胺 -葱 -痪 -摊 -荷 -卞 -乒 -髦 -寐 -铭 -坩 -胗 -枷 -爆 -溟 -嚼 -羚 -砬 -轨 -惊 -挠 -罄 -竽 -菏 -氧 -浅 -楣 -盼 -枢 -炸 -阆 -杯 -谏 -噬 -淇 -渺 -俪 -秆 -墓 -泪 -跻 -砌 -痰 -垡 -渡 -耽 -釜 -讶 -鳎 -煞 -呗 -韶 -舶 -绷 -鹳 -缜 -旷 -铊 -皱 -龌 -檀 -霖 -奄 -槐 -艳 -蝶 -旋 -哝 -赶 -骞 -蚧 -腊 -盈 -丁 -` -蜚 -矸 -蝙 -睨 -嚓 -僻 -鬼 -醴 -夜 -彝 -磊 -笔 -拔 -栀 -糕 -厦 -邰 -纫 -逭 -纤 -眦 -膊 -馍 -躇 -烯 -蘼 -冬 -诤 -暄 -骶 -哑 -瘠 -」 -臊 -丕 -愈 -咱 -螺 -擅 -跋 -搏 -硪 -谄 -笠 -淡 -嘿 -骅 -谧 -鼎 -皋 -姚 -歼 -蠢 -驼 -耳 -胬 -挝 -涯 -狗 -蒽 -孓 -犷 -凉 -芦 -箴 -铤 -孤 -嘛 -坤 -V -茴 -朦 -挞 -尖 -橙 -诞 -搴 -碇 -洵 -浚 -帚 -蜍 -漯 -柘 -嚎 -讽 -芭 -荤 -咻 -祠 -秉 -跖 -埃 -吓 -糯 -眷 -馒 -惹 -娼 -鲑 -嫩 -讴 -轮 -瞥 -靶 -褚 -乏 -缤 -宋 -帧 -删 -驱 -碎 -扑 -俩 -俄 -偏 -涣 -竹 -噱 -皙 -佰 -渚 -唧 -斡 -# -镉 -刀 -崎 -筐 -佣 -夭 -贰 -肴 -峙 -哔 -艿 -匐 -牺 -镛 -缘 -仡 -嫡 -劣 -枸 -堀 -梨 -簿 -鸭 -蒸 -亦 -稽 -浴 -{ -衢 -束 -槲 -j -阁 -揍 -疥 -棋 -潋 -聪 -窜 -乓 -睛 -插 -冉 -阪 -苍 -搽 -「 -蟾 -螟 -幸 -仇 -樽 -撂 -慢 -跤 -幔 -俚 -淅 -覃 -觊 -溶 -妖 -帛 -侨 -曰 -妾 -泗 -· -: -瀘 -風 -Ë -( -) -∶ -紅 -紗 -瑭 -雲 -頭 -鶏 -財 -許 -• -¥ -樂 -焗 -麗 -— -; -滙 -東 -榮 -繪 -興 -… -門 -業 -π -楊 -國 -顧 -é -盤 -寳 -Λ -龍 -鳳 -島 -誌 -緣 -結 -銭 -萬 -勝 -祎 -璟 -優 -歡 -臨 -時 -購 -= -★ -藍 -昇 -鐵 -觀 -勅 -農 -聲 -畫 -兿 -術 -發 -劉 -記 -專 -耑 -園 -書 -壴 -種 -Ο -● -褀 -號 -銀 -匯 -敟 -锘 -葉 -橪 -廣 -進 -蒄 -鑽 -阝 -祙 -貢 -鍋 -豊 -夬 -喆 -團 -閣 -開 -燁 -賓 -館 -酡 -沔 -順 -+ -硚 -劵 -饸 -陽 -車 -湓 -復 -萊 -氣 -軒 -華 -堃 -迮 -纟 -戶 -馬 -學 -裡 -電 -嶽 -獨 -マ -シ -サ -ジ -燘 -袪 -環 -❤ -臺 -灣 -専 -賣 -孖 -聖 -攝 -線 -▪ -α -傢 -俬 -夢 -達 -莊 -喬 -貝 -薩 -劍 -羅 -壓 -棛 -饦 -尃 -璈 -囍 -醫 -G -I -A -# -N -鷄 -髙 -嬰 -啓 -約 -隹 -潔 -賴 -藝 -~ -寶 -籣 -麺 -  -嶺 -√ -義 -網 -峩 -長 -∧ -魚 -機 -構 -② -鳯 -偉 -L -B -㙟 -畵 -鴿 -' -詩 -溝 -嚞 -屌 -藔 -佧 -玥 -蘭 -織 -1 -3 -9 -0 -7 -點 -砭 -鴨 -鋪 -銘 -廳 -弍 -‧ -創 -湯 -坶 -℃ -卩 -骝 -& -烜 -荘 -當 -潤 -扞 -係 -懷 -碶 -钅 -蚨 -讠 -☆ -叢 -爲 -埗 -涫 -塗 -→ -楽 -現 -鯨 -愛 -瑪 -鈺 -忄 -悶 -藥 -飾 -樓 -視 -孬 -ㆍ -燚 -苪 -師 -① -丼 -锽 -│ -韓 -標 -è -兒 -閏 -匋 -張 -漢 -Ü -髪 -會 -閑 -檔 -習 -裝 -の -峯 -菘 -輝 -И -雞 -釣 -億 -浐 -K -O -R -8 -H -E -P -T -W -D -S -C -M -F -姌 -饹 -» -晞 -廰 -ä -嵯 -鷹 -負 -飲 -絲 -冚 -楗 -澤 -綫 -區 -❋ -← -質 -靑 -揚 -③ -滬 -統 -産 -協 -﹑ -乸 -畐 -經 -運 -際 -洺 -岽 -為 -粵 -諾 -崋 -豐 -碁 -ɔ -V -2 -6 -齋 -誠 -訂 -´ -勑 -雙 -陳 -無 -í -泩 -媄 -夌 -刂 -i -c -t -o -r -a -嘢 -耄 -燴 -暃 -壽 -媽 -靈 -抻 -體 -唻 -É -冮 -甹 -鎮 -錦 -ʌ -蜛 -蠄 -尓 -駕 -戀 -飬 -逹 -倫 -貴 -極 -Я -Й -寬 -磚 -嶪 -郎 -職 -| -間 -n -d -剎 -伈 -課 -飛 -橋 -瘊 -№ -譜 -骓 -圗 -滘 -縣 -粿 -咅 -養 -濤 -彳 -® -% -Ⅱ -啰 -㴪 -見 -矞 -薬 -糁 -邨 -鲮 -顔 -罱 -З -選 -話 -贏 -氪 -俵 -競 -瑩 -繡 -枱 -β -綉 -á -獅 -爾 -™ -麵 -戋 -淩 -徳 -個 -劇 -場 -務 -簡 -寵 -h -實 -膠 -轱 -圖 -築 -嘣 -樹 -㸃 -營 -耵 -孫 -饃 -鄺 -飯 -麯 -遠 -輸 -坫 -孃 -乚 
-閃 -鏢 -㎡ -題 -廠 -關 -↑ -爺 -將 -軍 -連 -篦 -覌 -參 -箸 -- -窠 -棽 -寕 -夀 -爰 -歐 -呙 -閥 -頡 -熱 -雎 -垟 -裟 -凬 -勁 -帑 -馕 -夆 -疌 -枼 -馮 -貨 -蒤 -樸 -彧 -旸 -靜 -龢 -暢 -㐱 -鳥 -珺 -鏡 -灡 -爭 -堷 -廚 -Ó -騰 -診 -┅ -蘇 -褔 -凱 -頂 -豕 -亞 -帥 -嘬 -⊥ -仺 -桖 -複 -饣 -絡 -穂 -顏 -棟 -納 -▏ -濟 -親 -設 -計 -攵 -埌 -烺 -ò -頤 -燦 -蓮 -撻 -節 -講 -濱 -濃 -娽 -洳 -朿 -燈 -鈴 -護 -膚 -铔 -過 -補 -Z -U -5 -4 -坋 -闿 -䖝 -餘 -缐 -铞 -貿 -铪 -桼 -趙 -鍊 -[ -㐂 -垚 -菓 -揸 -捲 -鐘 -滏 -𣇉 -爍 -輪 -燜 -鴻 -鮮 -動 -鹞 -鷗 -丄 -慶 -鉌 -翥 -飮 -腸 -⇋ -漁 -覺 -來 -熘 -昴 -翏 -鲱 -圧 -鄉 -萭 -頔 -爐 -嫚 -г -貭 -類 -聯 -幛 -輕 -訓 -鑒 -夋 -锨 -芃 -珣 -䝉 -扙 -嵐 -銷 -處 -ㄱ -語 -誘 -苝 -歸 -儀 -燒 -楿 -內 -粢 -葒 -奧 -麥 -礻 -滿 -蠔 -穵 -瞭 -態 -鱬 -榞 -硂 -鄭 -黃 -煙 -祐 -奓 -逺 -* -瑄 -獲 -聞 -薦 -讀 -這 -樣 -決 -問 -啟 -們 -執 -説 -轉 -單 -隨 -唘 -帶 -倉 -庫 -還 -贈 -尙 -皺 -■ -餅 -產 -○ -∈ -報 -狀 -楓 -賠 -琯 -嗮 -禮 -` -傳 -> -≤ -嗞 -Φ -≥ -換 -咭 -∣ -↓ -曬 -ε -応 -寫 -″ -終 -様 -純 -費 -療 -聨 -凍 -壐 -郵 -ü -黒 -∫ -製 -塊 -調 -軽 -確 -撃 -級 -馴 -Ⅲ -涇 -繹 -數 -碼 -證 -狒 -処 -劑 -< -晧 -賀 -衆 -] -櫥 -兩 -陰 -絶 -對 -鯉 -憶 -◎ -p -e -Y -蕒 -煖 -頓 -測 -試 -鼽 -僑 -碩 -妝 -帯 -≈ -鐡 -舖 -權 -喫 -倆 -ˋ -該 -悅 -ā -俫 -. -f -s -b -m -k -g -u -j -貼 -淨 -濕 -針 -適 -備 -l -/ -給 -謢 -強 -觸 -衛 -與 -⊙ -$ -緯 -變 -⑴ -⑵ -⑶ -㎏ -殺 -∩ -幚 -─ -價 -▲ -離 -ú -ó -飄 -烏 -関 -閟 -﹝ -﹞ -邏 -輯 -鍵 -驗 -訣 -導 -歷 -屆 -層 -▼ -儱 -錄 -熳 -ē -艦 -吋 -錶 -辧 -飼 -顯 -④ -禦 -販 -気 -対 -枰 -閩 -紀 -幹 -瞓 -貊 -淚 -△ -眞 -墊 -Ω -獻 -褲 -縫 -緑 -亜 -鉅 -餠 -{ -} -◆ -蘆 -薈 -█ -◇ -溫 -彈 -晳 -粧 -犸 -穩 -訊 -崬 -凖 -熥 -П -舊 -條 -紋 -圍 -Ⅳ -筆 -尷 -難 -雜 -錯 -綁 -識 -頰 -鎖 -艶 -□ -殁 -殼 -⑧ -├ -▕ -鵬 -ǐ -ō -ǒ -糝 -綱 -▎ -μ -盜 -饅 -醬 -籤 -蓋 -釀 -鹽 -據 -à -ɡ -辦 -◥ -彐 -┌ -婦 -獸 -鲩 -伱 -ī -蒟 -蒻 -齊 -袆 -腦 -寧 -凈 -妳 -煥 -詢 -偽 -謹 -啫 -鯽 -騷 -鱸 -損 -傷 -鎻 -髮 -買 -冏 -儥 -両 -﹢ -∞ -載 -喰 -z -羙 -悵 -燙 -曉 -員 -組 -徹 -艷 -痠 -鋼 -鼙 -縮 -細 -嚒 -爯 -≠ -維 -" -鱻 -壇 -厍 -帰 -浥 -犇 -薡 -軎 -² -應 -醜 -刪 -緻 -鶴 -賜 -噁 -軌 -尨 -镔 -鷺 -槗 -彌 -葚 -濛 -請 -溇 -緹 -賢 -訪 -獴 -瑅 -資 -縤 -陣 -蕟 -栢 -韻 -祼 -恁 -伢 -謝 -劃 -涑 -總 -衖 -踺 -砋 -凉 -籃 -駿 -苼 -瘋 -昽 -紡 -驊 -腎 -﹗ -響 -杋 -剛 -嚴 -禪 -歓 -槍 -傘 -檸 -檫 -炣 -勢 -鏜 -鎢 -銑 -尐 -減 -奪 -惡 -θ -僮 -婭 -臘 -ū -ì -殻 -鉄 -∑ -蛲 -焼 -緖 -續 -紹 -懮 \ No newline at end of file diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv4_doc_dict.txt b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv4_doc_dict.txt deleted file mode 100644 index 09e275bae943431ae75f583b9f4519c96161eb85..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv4_doc_dict.txt +++ /dev/null @@ -1,15629 +0,0 @@ -' -疗 -绚 -诚 -娇 -溜 -题 -贿 -者 -廖 -更 -纳 -加 -奉 -公 -一 -就 -汴 -计 -与 -路 -房 -原 -妇 -2 -0 -8 -- -7 -其 -> -: -] -, -, -骑 -刈 -全 -消 -昏 -傈 -安 -久 -钟 -嗅 -不 -影 -处 -驽 -蜿 -资 -关 -椤 -地 -瘸 -专 -问 -忖 -票 -嫉 -炎 -韵 -要 -月 -田 -节 -陂 -鄙 -捌 -备 -拳 -伺 -眼 -网 -盎 -大 -傍 -心 -东 -愉 -汇 -蹿 -科 -每 -业 -里 -航 -晏 -字 -平 -录 -先 -1 -3 -彤 -鲶 -产 -稍 -督 -腴 -有 -象 -岳 -注 -绍 -在 -泺 -文 -定 -核 -名 -水 -过 -理 -让 -偷 -率 -等 -这 -发 -” -为 -含 -肥 -酉 -相 -鄱 -七 -编 -猥 -锛 -日 -镀 -蒂 -掰 -倒 -辆 -栾 -栗 -综 -涩 -州 -雌 -滑 -馀 -了 -机 -块 -司 -宰 -甙 -兴 -矽 -抚 -保 -用 -沧 -秩 -如 -收 -息 -滥 -页 -疑 -埠 -! -! 
-姥 -异 -橹 -钇 -向 -下 -跄 -的 -椴 -沫 -国 -绥 -獠 -报 -开 -民 -蜇 -何 -分 -凇 -长 -讥 -藏 -掏 -施 -羽 -中 -讲 -派 -嘟 -人 -提 -浼 -间 -世 -而 -古 -多 -倪 -唇 -饯 -控 -庚 -首 -赛 -蜓 -味 -断 -制 -觉 -技 -替 -艰 -溢 -潮 -夕 -钺 -外 -摘 -枋 -动 -双 -单 -啮 -户 -枇 -确 -锦 -曜 -杜 -或 -能 -效 -霜 -盒 -然 -侗 -电 -晁 -放 -步 -鹃 -新 -杖 -蜂 -吒 -濂 -瞬 -评 -总 -隍 -对 -独 -合 -也 -是 -府 -青 -天 -诲 -墙 -组 -滴 -级 -邀 -帘 -示 -已 -时 -骸 -仄 -泅 -和 -遨 -店 -雇 -疫 -持 -巍 -踮 -境 -只 -亨 -目 -鉴 -崤 -闲 -体 -泄 -杂 -作 -般 -轰 -化 -解 -迂 -诿 -蛭 -璀 -腾 -告 -版 -服 -省 -师 -小 -规 -程 -线 -海 -办 -引 -二 -桧 -牌 -砺 -洄 -裴 -修 -图 -痫 -胡 -许 -犊 -事 -郛 -基 -柴 -呼 -食 -研 -奶 -律 -蛋 -因 -葆 -察 -戏 -褒 -戒 -再 -李 -骁 -工 -貂 -油 -鹅 -章 -啄 -休 -场 -给 -睡 -纷 -豆 -器 -捎 -说 -敏 -学 -会 -浒 -设 -诊 -格 -廓 -查 -来 -霓 -室 -溆 -¢ -诡 -寥 -焕 -舜 -柒 -狐 -回 -戟 -砾 -厄 -实 -翩 -尿 -五 -入 -径 -惭 -喹 -股 -宇 -篝 -| -; -美 -期 -云 -九 -祺 -扮 -靠 -锝 -槌 -系 -企 -酰 -阊 -暂 -蚕 -忻 -豁 -本 -羹 -执 -条 -钦 -H -獒 -限 -进 -季 -楦 -于 -芘 -玖 -铋 -茯 -未 -答 -粘 -括 -样 -精 -欠 -矢 -甥 -帷 -嵩 -扣 -令 -仔 -风 -皈 -行 -支 -部 -蓉 -刮 -站 -蜡 -救 -钊 -汗 -松 -嫌 -成 -可 -. -鹤 -院 -从 -交 -政 -怕 -活 -调 -球 -局 -验 -髌 -第 -韫 -谗 -串 -到 -圆 -年 -米 -/ -* -友 -忿 -检 -区 -看 -自 -敢 -刃 -个 -兹 -弄 -流 -留 -同 -没 -齿 -星 -聆 -轼 -湖 -什 -三 -建 -蛔 -儿 -椋 -汕 -震 -颧 -鲤 -跟 -力 -情 -璺 -铨 -陪 -务 -指 -族 -训 -滦 -鄣 -濮 -扒 -商 -箱 -十 -召 -慷 -辗 -所 -莞 -管 -护 -臭 -横 -硒 -嗓 -接 -侦 -六 -露 -党 -馋 -驾 -剖 -高 -侬 -妪 -幂 -猗 -绺 -骐 -央 -酐 -孝 -筝 -课 -徇 -缰 -门 -男 -西 -项 -句 -谙 -瞒 -秃 -篇 -教 -碲 -罚 -声 -呐 -景 -前 -富 -嘴 -鳌 -稀 -免 -朋 -啬 -睐 -去 -赈 -鱼 -住 -肩 -愕 -速 -旁 -波 -厅 -健 -茼 -厥 -鲟 -谅 -投 -攸 -炔 -数 -方 -击 -呋 -谈 -绩 -别 -愫 -僚 -躬 -鹧 -胪 -炳 -招 -喇 -膨 -泵 -蹦 -毛 -结 -5 -4 -谱 -识 -陕 -粽 -婚 -拟 -构 -且 -搜 -任 -潘 -比 -郢 -妨 -醪 -陀 -桔 -碘 -扎 -选 -哈 -骷 -楷 -亿 -明 -缆 -脯 -监 -睫 -逻 -婵 -共 -赴 -淝 -凡 -惦 -及 -达 -揖 -谩 -澹 -减 -焰 -蛹 -番 -祁 -柏 -员 -禄 -怡 -峤 -龙 -白 -叽 -生 -闯 -起 -细 -装 -谕 -竟 -聚 -钙 -上 -导 -渊 -按 -艾 -辘 -挡 -耒 -盹 -饪 -臀 -记 -邮 -蕙 -受 -各 -医 -搂 -普 -滇 -朗 -茸 -带 -翻 -酚 -( -光 -堤 -墟 -蔷 -万 -幻 -〓 -瑙 -辈 -昧 -盏 -亘 -蛀 -吉 -铰 -请 -子 -假 -闻 -税 -井 -诩 -哨 -嫂 -好 -面 -琐 -校 -馊 -鬣 -缂 -营 -访 -炖 -占 -农 -缀 -否 -经 -钚 -棵 -趟 -张 -亟 -吏 -茶 -谨 -捻 -论 -迸 -堂 -玉 -信 -吧 -瞠 -乡 -姬 -寺 -咬 -溏 -苄 -皿 -意 -赉 -宝 -尔 -钰 -艺 -特 -唳 -踉 -都 -荣 -倚 -登 -荐 -丧 -奇 -涵 -批 -炭 -近 -符 -傩 -感 -道 -着 -菊 -虹 -仲 -众 -懈 -濯 -颞 -眺 -南 -释 -北 -缝 -标 -既 -茗 -整 -撼 -迤 -贲 -挎 -耱 -拒 -某 -妍 -卫 -哇 -英 -矶 -藩 -治 -他 -元 -领 -膜 -遮 -穗 -蛾 -飞 -荒 -棺 -劫 -么 -市 -火 -温 -拈 -棚 -洼 -转 -果 -奕 -卸 -迪 -伸 -泳 -斗 -邡 -侄 -涨 -屯 -萋 -胭 -氡 -崮 -枞 -惧 -冒 -彩 -斜 -手 -豚 -随 -旭 -淑 -妞 -形 -菌 -吲 -沱 -争 -驯 -歹 -挟 -兆 -柱 -传 -至 -包 -内 -响 -临 -红 -功 -弩 -衡 -寂 -禁 -老 -棍 -耆 -渍 -织 -害 -氵 -渑 -布 -载 -靥 -嗬 -虽 -苹 -咨 -娄 -库 -雉 -榜 -帜 -嘲 -套 -瑚 -亲 -簸 -欧 -边 -6 -腿 -旮 -抛 -吹 -瞳 -得 -镓 -梗 -厨 -继 -漾 -愣 -憨 -士 -策 -窑 -抑 -躯 -襟 -脏 -参 -贸 -言 -干 -绸 -鳄 -穷 -藜 -音 -折 -详 -) -举 -悍 -甸 -癌 -黎 -谴 -死 -罩 -迁 -寒 -驷 -袖 -媒 -蒋 -掘 -模 -纠 -恣 -观 -祖 -蛆 -碍 -位 -稿 -主 -澧 -跌 -筏 -京 -锏 -帝 -贴 -证 -糠 -才 -黄 -鲸 -略 -炯 -饱 -四 -出 -园 -犀 -牧 -容 -汉 -杆 -浈 -汰 -瑷 -造 -虫 -瘩 -怪 -驴 -济 -应 -花 -沣 -谔 -夙 -旅 -价 -矿 -以 -考 -s -u -呦 -晒 -巡 -茅 -准 -肟 -瓴 -詹 -仟 -褂 -译 -桌 -混 -宁 -怦 -郑 -抿 -些 -余 -鄂 -饴 -攒 -珑 -群 -阖 -岔 -琨 -藓 -预 -环 -洮 -岌 -宀 -杲 -瀵 -最 -常 -囡 -周 -踊 -女 -鼓 -袭 -喉 -简 -范 -薯 -遐 -疏 -粱 -黜 -禧 -法 -箔 -斤 -遥 -汝 -奥 -直 -贞 -撑 -置 -绱 -集 -她 -馅 -逗 -钧 -橱 -魉 -[ -恙 -躁 -唤 -9 -旺 -膘 -待 -脾 -惫 -购 -吗 -依 -盲 -度 -瘿 -蠖 -俾 -之 -镗 -拇 -鲵 -厝 -簧 -续 -款 -展 -啃 -表 -剔 -品 -钻 -腭 -损 -清 -锶 -统 -涌 -寸 -滨 -贪 -链 -吠 -冈 -伎 -迥 -咏 -吁 -览 -防 -迅 -失 -汾 -阔 -逵 -绀 -蔑 -列 -川 -凭 -努 -熨 -揪 -利 -俱 -绉 -抢 -鸨 -我 -即 -责 -膦 -易 -毓 -鹊 -刹 -玷 -岿 -空 -嘞 -绊 -排 -术 -估 -锷 -违 -们 -苟 -铜 -播 -肘 -件 -烫 -审 -鲂 -广 -像 -铌 -惰 -铟 -巳 -胍 -鲍 -康 -憧 -色 -恢 -想 -拷 -尤 -疳 -知 -S -Y -F -D -A -峄 -裕 -帮 -握 -搔 -氐 -氘 -难 -墒 -沮 -雨 -叁 -缥 -悴 -藐 -湫 -娟 -苑 -稠 -颛 -簇 -后 -阕 -闭 -蕤 -缚 -怎 -佞 -码 -嘤 -蔡 -痊 -舱 -螯 -帕 -赫 -昵 -升 -烬 -岫 -、 -疵 -蜻 -髁 -蕨 -隶 -烛 -械 -丑 -盂 -梁 -强 -鲛 -由 -拘 -揉 -劭 -龟 -撤 -钩 -呕 -孛 -费 -妻 -漂 -求 -阑 -崖 -秤 -甘 -通 -深 -补 -赃 -坎 -床 -啪 -承 -吼 -量 -暇 -钼 -烨 -阂 -擎 -脱 -逮 -称 -P -神 -属 -矗 -华 -届 -狍 -葑 -汹 -育 -患 -窒 -蛰 -佼 -静 -槎 -运 -鳗 -庆 -逝 -曼 -疱 -克 -代 -官 -此 -麸 -耧 -蚌 -晟 -例 -础 -榛 -副 -测 -唰 -缢 -迹 -灬 -霁 -身 -岁 -赭 
-扛 -又 -菡 -乜 -雾 -板 -读 -陷 -徉 -贯 -郁 -虑 -变 -钓 -菜 -圾 -现 -琢 -式 -乐 -维 -渔 -浜 -左 -吾 -脑 -钡 -警 -T -啵 -拴 -偌 -漱 -湿 -硕 -止 -骼 -魄 -积 -燥 -联 -踢 -玛 -则 -窿 -见 -振 -畿 -送 -班 -钽 -您 -赵 -刨 -印 -讨 -踝 -籍 -谡 -舌 -崧 -汽 -蔽 -沪 -酥 -绒 -怖 -财 -帖 -肱 -私 -莎 -勋 -羔 -霸 -励 -哼 -帐 -将 -帅 -渠 -纪 -婴 -娩 -岭 -厘 -滕 -吻 -伤 -坝 -冠 -戊 -隆 -瘁 -介 -涧 -物 -黍 -并 -姗 -奢 -蹑 -掣 -垸 -锴 -命 -箍 -捉 -病 -辖 -琰 -眭 -迩 -艘 -绌 -繁 -寅 -若 -毋 -思 -诉 -类 -诈 -燮 -轲 -酮 -狂 -重 -反 -职 -筱 -县 -委 -磕 -绣 -奖 -晋 -濉 -志 -徽 -肠 -呈 -獐 -坻 -口 -片 -碰 -几 -村 -柿 -劳 -料 -获 -亩 -惕 -晕 -厌 -号 -罢 -池 -正 -鏖 -煨 -家 -棕 -复 -尝 -懋 -蜥 -锅 -岛 -扰 -队 -坠 -瘾 -钬 -@ -卧 -疣 -镇 -譬 -冰 -彷 -频 -黯 -据 -垄 -采 -八 -缪 -瘫 -型 -熹 -砰 -楠 -襁 -箐 -但 -嘶 -绳 -啤 -拍 -盥 -穆 -傲 -洗 -盯 -塘 -怔 -筛 -丿 -台 -恒 -喂 -葛 -永 -¥ -烟 -酒 -桦 -书 -砂 -蚝 -缉 -态 -瀚 -袄 -圳 -轻 -蛛 -超 -榧 -遛 -姒 -奘 -铮 -右 -荽 -望 -偻 -卡 -丶 -氰 -附 -做 -革 -索 -戚 -坨 -桷 -唁 -垅 -榻 -岐 -偎 -坛 -莨 -山 -殊 -微 -骇 -陈 -爨 -推 -嗝 -驹 -澡 -藁 -呤 -卤 -嘻 -糅 -逛 -侵 -郓 -酌 -德 -摇 -※ -鬃 -被 -慨 -殡 -羸 -昌 -泡 -戛 -鞋 -河 -宪 -沿 -玲 -鲨 -翅 -哽 -源 -铅 -语 -照 -邯 -址 -荃 -佬 -顺 -鸳 -町 -霭 -睾 -瓢 -夸 -椁 -晓 -酿 -痈 -咔 -侏 -券 -噎 -湍 -签 -嚷 -离 -午 -尚 -社 -锤 -背 -孟 -使 -浪 -缦 -潍 -鞅 -军 -姹 -驶 -笑 -鳟 -鲁 -》 -孽 -钜 -绿 -洱 -礴 -焯 -椰 -颖 -囔 -乌 -孔 -巴 -互 -性 -椽 -哞 -聘 -昨 -早 -暮 -胶 -炀 -隧 -低 -彗 -昝 -铁 -呓 -氽 -藉 -喔 -癖 -瑗 -姨 -权 -胱 -韦 -堑 -蜜 -酋 -楝 -砝 -毁 -靓 -歙 -锲 -究 -屋 -喳 -骨 -辨 -碑 -武 -鸠 -宫 -辜 -烊 -适 -坡 -殃 -培 -佩 -供 -走 -蜈 -迟 -翼 -况 -姣 -凛 -浔 -吃 -飘 -债 -犟 -金 -促 -苛 -崇 -坂 -莳 -畔 -绂 -兵 -蠕 -斋 -根 -砍 -亢 -欢 -恬 -崔 -剁 -餐 -榫 -快 -扶 -‖ -濒 -缠 -鳜 -当 -彭 -驭 -浦 -篮 -昀 -锆 -秸 -钳 -弋 -娣 -瞑 -夷 -龛 -苫 -拱 -致 -% -嵊 -障 -隐 -弑 -初 -娓 -抉 -汩 -累 -蓖 -" -唬 -助 -苓 -昙 -押 -毙 -破 -城 -郧 -逢 -嚏 -獭 -瞻 -溱 -婿 -赊 -跨 -恼 -璧 -萃 -姻 -貉 -灵 -炉 -密 -氛 -陶 -砸 -谬 -衔 -点 -琛 -沛 -枳 -层 -岱 -诺 -脍 -榈 -埂 -征 -冷 -裁 -打 -蹴 -素 -瘘 -逞 -蛐 -聊 -激 -腱 -萘 -踵 -飒 -蓟 -吆 -取 -咙 -簋 -涓 -矩 -曝 -挺 -揣 -座 -你 -史 -舵 -焱 -尘 -苏 -笈 -脚 -溉 -榨 -诵 -樊 -邓 -焊 -义 -庶 -儋 -蟋 -蒲 -赦 -呷 -杞 -诠 -豪 -还 -试 -颓 -茉 -太 -除 -紫 -逃 -痴 -草 -充 -鳕 -珉 -祗 -墨 -渭 -烩 -蘸 -慕 -璇 -镶 -穴 -嵘 -恶 -骂 -险 -绋 -幕 -碉 -肺 -戳 -刘 -潞 -秣 -纾 -潜 -銮 -洛 -须 -罘 -销 -瘪 -汞 -兮 -屉 -r -林 -厕 -质 -探 -划 -狸 -殚 -善 -煊 -烹 -〒 -锈 -逯 -宸 -辍 -泱 -柚 -袍 -远 -蹋 -嶙 -绝 -峥 -娥 -缍 -雀 -徵 -认 -镱 -谷 -= -贩 -勉 -撩 -鄯 -斐 -洋 -非 -祚 -泾 -诒 -饿 -撬 -威 -晷 -搭 -芍 -锥 -笺 -蓦 -候 -琊 -档 -礁 -沼 -卵 -荠 -忑 -朝 -凹 -瑞 -头 -仪 -弧 -孵 -畏 -铆 -突 -衲 -车 -浩 -气 -茂 -悖 -厢 -枕 -酝 -戴 -湾 -邹 -飚 -攘 -锂 -写 -宵 -翁 -岷 -无 -喜 -丈 -挑 -嗟 -绛 -殉 -议 -槽 -具 -醇 -淞 -笃 -郴 -阅 -饼 -底 -壕 -砚 -弈 -询 -缕 -庹 -翟 -零 -筷 -暨 -舟 -闺 -甯 -撞 -麂 -茌 -蔼 -很 -珲 -捕 -棠 -角 -阉 -媛 -娲 -诽 -剿 -尉 -爵 -睬 -韩 -诰 -匣 -危 -糍 -镯 -立 -浏 -阳 -少 -盆 -舔 -擘 -匪 -申 -尬 -铣 -旯 -抖 -赘 -瓯 -居 -ˇ -哮 -游 -锭 -茏 -歌 -坏 -甚 -秒 -舞 -沙 -仗 -劲 -潺 -阿 -燧 -郭 -嗖 -霏 -忠 -材 -奂 -耐 -跺 -砀 -输 -岖 -媳 -氟 -极 -摆 -灿 -今 -扔 -腻 -枝 -奎 -药 -熄 -吨 -话 -q -额 -慑 -嘌 -协 -喀 -壳 -埭 -视 -著 -於 -愧 -陲 -翌 -峁 -颅 -佛 -腹 -聋 -侯 -咎 -叟 -秀 -颇 -存 -较 -罪 -哄 -岗 -扫 -栏 -钾 -羌 -己 -璨 -枭 -霉 -煌 -涸 -衿 -键 -镝 -益 -岢 -奏 -连 -夯 -睿 -冥 -均 -糖 -狞 -蹊 -稻 -爸 -刿 -胥 -煜 -丽 -肿 -璃 -掸 -跚 -灾 -垂 -樾 -濑 -乎 -莲 -窄 -犹 -撮 -战 -馄 -软 -络 -显 -鸢 -胸 -宾 -妲 -恕 -埔 -蝌 -份 -遇 -巧 -瞟 -粒 -恰 -剥 -桡 -博 -讯 -凯 -堇 -阶 -滤 -卖 -斌 -骚 -彬 -兑 -磺 -樱 -舷 -两 -娱 -福 -仃 -差 -找 -桁 -÷ -净 -把 -阴 -污 -戬 -雷 -碓 -蕲 -楚 -罡 -焖 -抽 -妫 -咒 -仑 -闱 -尽 -邑 -菁 -爱 -贷 -沥 -鞑 -牡 -嗉 -崴 -骤 -塌 -嗦 -订 -拮 -滓 -捡 -锻 -次 -坪 -杩 -臃 -箬 -融 -珂 -鹗 -宗 -枚 -降 -鸬 -妯 -阄 -堰 -盐 -毅 -必 -杨 -崃 -俺 -甬 -状 -莘 -货 -耸 -菱 -腼 -铸 -唏 -痤 -孚 -澳 -懒 -溅 -翘 -疙 -杷 -淼 -缙 -骰 -喊 -悉 -砻 -坷 -艇 -赁 -界 -谤 -纣 -宴 -晃 -茹 -归 -饭 -梢 -铡 -街 -抄 -肼 -鬟 -苯 -颂 -撷 -戈 -炒 -咆 -茭 -瘙 -负 -仰 -客 -琉 -铢 -封 -卑 -珥 -椿 -镧 -窨 -鬲 -寿 -御 -袤 -铃 -萎 -砖 -餮 -脒 -裳 -肪 -孕 -嫣 -馗 -嵇 -恳 -氯 -江 -石 -褶 -冢 -祸 -阻 -狈 -羞 -银 -靳 -透 -咳 -叼 -敷 -芷 -啥 -它 -瓤 -兰 -痘 -懊 -逑 -肌 -往 -捺 -坊 -甩 -呻 -〃 -沦 -忘 -膻 -祟 -菅 -剧 -崆 -智 -坯 -臧 -霍 -墅 -攻 -眯 -倘 -拢 -骠 -铐 -庭 -岙 -瓠 -′ -缺 -泥 -迢 -捶 -? -? 
-郏 -喙 -掷 -沌 -纯 -秘 -种 -听 -绘 -固 -螨 -团 -香 -盗 -妒 -埚 -蓝 -拖 -旱 -荞 -铀 -血 -遏 -汲 -辰 -叩 -拽 -幅 -硬 -惶 -桀 -漠 -措 -泼 -唑 -齐 -肾 -念 -酱 -虚 -屁 -耶 -旗 -砦 -闵 -婉 -馆 -拭 -绅 -韧 -忏 -窝 -醋 -葺 -顾 -辞 -倜 -堆 -辋 -逆 -玟 -贱 -疾 -董 -惘 -倌 -锕 -淘 -嘀 -莽 -俭 -笏 -绑 -鲷 -杈 -择 -蟀 -粥 -嗯 -驰 -逾 -案 -谪 -褓 -胫 -哩 -昕 -颚 -鲢 -绠 -躺 -鹄 -崂 -儒 -俨 -丝 -尕 -泌 -啊 -萸 -彰 -幺 -吟 -骄 -苣 -弦 -脊 -瑰 -〈 -诛 -镁 -析 -闪 -剪 -侧 -哟 -框 -螃 -守 -嬗 -燕 -狭 -铈 -缮 -概 -迳 -痧 -鲲 -俯 -售 -笼 -痣 -扉 -挖 -满 -咋 -援 -邱 -扇 -歪 -便 -玑 -绦 -峡 -蛇 -叨 -〖 -泽 -胃 -斓 -喋 -怂 -坟 -猪 -该 -蚬 -炕 -弥 -赞 -棣 -晔 -娠 -挲 -狡 -创 -疖 -铕 -镭 -稷 -挫 -弭 -啾 -翔 -粉 -履 -苘 -哦 -楼 -秕 -铂 -土 -锣 -瘟 -挣 -栉 -习 -享 -桢 -袅 -磨 -桂 -谦 -延 -坚 -蔚 -噗 -署 -谟 -猬 -钎 -恐 -嬉 -雒 -倦 -衅 -亏 -璩 -睹 -刻 -殿 -王 -算 -雕 -麻 -丘 -柯 -骆 -丸 -塍 -谚 -添 -鲈 -垓 -桎 -蚯 -芥 -予 -飕 -镦 -谌 -窗 -醚 -菀 -亮 -搪 -莺 -蒿 -羁 -足 -J -真 -轶 -悬 -衷 -靛 -翊 -掩 -哒 -炅 -掐 -冼 -妮 -l -谐 -稚 -荆 -擒 -犯 -陵 -虏 -浓 -崽 -刍 -陌 -傻 -孜 -千 -靖 -演 -矜 -钕 -煽 -杰 -酗 -渗 -伞 -栋 -俗 -泫 -戍 -罕 -沾 -疽 -灏 -煦 -芬 -磴 -叱 -阱 -榉 -湃 -蜀 -叉 -醒 -彪 -租 -郡 -篷 -屎 -良 -垢 -隗 -弱 -陨 -峪 -砷 -掴 -颁 -胎 -雯 -绵 -贬 -沐 -撵 -隘 -篙 -暖 -曹 -陡 -栓 -填 -臼 -彦 -瓶 -琪 -潼 -哪 -鸡 -摩 -啦 -俟 -锋 -域 -耻 -蔫 -疯 -纹 -撇 -毒 -绶 -痛 -酯 -忍 -爪 -赳 -歆 -嘹 -辕 -烈 -册 -朴 -钱 -吮 -毯 -癜 -娃 -谀 -邵 -厮 -炽 -璞 -邃 -丐 -追 -词 -瓒 -忆 -轧 -芫 -谯 -喷 -弟 -半 -冕 -裙 -掖 -墉 -绮 -寝 -苔 -势 -顷 -褥 -切 -衮 -君 -佳 -嫒 -蚩 -霞 -佚 -洙 -逊 -镖 -暹 -唛 -& -殒 -顶 -碗 -獗 -轭 -铺 -蛊 -废 -恹 -汨 -崩 -珍 -那 -杵 -曲 -纺 -夏 -薰 -傀 -闳 -淬 -姘 -舀 -拧 -卷 -楂 -恍 -讪 -厩 -寮 -篪 -赓 -乘 -灭 -盅 -鞣 -沟 -慎 -挂 -饺 -鼾 -杳 -树 -缨 -丛 -絮 -娌 -臻 -嗳 -篡 -侩 -述 -衰 -矛 -圈 -蚜 -匕 -筹 -匿 -濞 -晨 -叶 -骋 -郝 -挚 -蚴 -滞 -增 -侍 -描 -瓣 -吖 -嫦 -蟒 -匾 -圣 -赌 -毡 -癞 -恺 -百 -曳 -需 -篓 -肮 -庖 -帏 -卿 -驿 -遗 -蹬 -鬓 -骡 -歉 -芎 -胳 -屐 -禽 -烦 -晌 -寄 -媾 -狄 -翡 -苒 -船 -廉 -终 -痞 -殇 -々 -畦 -饶 -改 -拆 -悻 -萄 -£ -瓿 -乃 -訾 -桅 -匮 -溧 -拥 -纱 -铍 -骗 -蕃 -龋 -缬 -父 -佐 -疚 -栎 -醍 -掳 -蓄 -x -惆 -颜 -鲆 -榆 -〔 -猎 -敌 -暴 -谥 -鲫 -贾 -罗 -玻 -缄 -扦 -芪 -癣 -落 -徒 -臾 -恿 -猩 -托 -邴 -肄 -牵 -春 -陛 -耀 -刊 -拓 -蓓 -邳 -堕 -寇 -枉 -淌 -啡 -湄 -兽 -酷 -萼 -碚 -濠 -萤 -夹 -旬 -戮 -梭 -琥 -椭 -昔 -勺 -蜊 -绐 -晚 -孺 -僵 -宣 -摄 -冽 -旨 -萌 -忙 -蚤 -眉 -噼 -蟑 -付 -契 -瓜 -悼 -颡 -壁 -曾 -窕 -颢 -澎 -仿 -俑 -浑 -嵌 -浣 -乍 -碌 -褪 -乱 -蔟 -隙 -玩 -剐 -葫 -箫 -纲 -围 -伐 -决 -伙 -漩 -瑟 -刑 -肓 -镳 -缓 -蹭 -氨 -皓 -典 -畲 -坍 -铑 -檐 -塑 -洞 -倬 -储 -胴 -淳 -戾 -吐 -灼 -惺 -妙 -毕 -珐 -缈 -虱 -盖 -羰 -鸿 -磅 -谓 -髅 -娴 -苴 -唷 -蚣 -霹 -抨 -贤 -唠 -犬 -誓 -逍 -庠 -逼 -麓 -籼 -釉 -呜 -碧 -秧 -氩 -摔 -霄 -穸 -纨 -辟 -妈 -映 -完 -牛 -缴 -嗷 -炊 -恩 -荔 -茆 -掉 -紊 -慌 -莓 -羟 -阙 -萁 -磐 -另 -蕹 -辱 -鳐 -湮 -吡 -吩 -唐 -睦 -垠 -舒 -圜 -冗 -瞿 -溺 -芾 -囱 -匠 -僳 -汐 -菩 -饬 -漓 -黑 -霰 -浸 -濡 -窥 -毂 -蒡 -兢 -驻 -鹉 -芮 -诙 -迫 -雳 -厂 -忐 -臆 -猴 -鸣 -蚪 -栈 -箕 -羡 -渐 -莆 -捍 -眈 -哓 -趴 -蹼 -埕 -嚣 -骛 -宏 -淄 -斑 -噜 -严 -瑛 -垃 -椎 -诱 -压 -庾 -绞 -焘 -廿 -抡 -迄 -棘 -夫 -纬 -锹 -眨 -瞌 -侠 -脐 -竞 -瀑 -孳 -骧 -遁 -姜 -颦 -荪 -滚 -萦 -伪 -逸 -粳 -爬 -锁 -矣 -役 -趣 -洒 -颔 -诏 -逐 -奸 -甭 -惠 -攀 -蹄 -泛 -尼 -拼 -阮 -鹰 -亚 -颈 -惑 -勒 -〉 -际 -肛 -爷 -刚 -钨 -丰 -养 -冶 -鲽 -辉 -蔻 -画 -覆 -皴 -妊 -麦 -返 -醉 -皂 -擀 -〗 -酶 -凑 -粹 -悟 -诀 -硖 -港 -卜 -z -杀 -涕 -± -舍 -铠 -抵 -弛 -段 -敝 -镐 -奠 -拂 -轴 -跛 -袱 -e -t -沉 -菇 -俎 -薪 -峦 -秭 -蟹 -历 -盟 -菠 -寡 -液 -肢 -喻 -染 -裱 -悱 -抱 -氙 -赤 -捅 -猛 -跑 -氮 -谣 -仁 -尺 -辊 -窍 -烙 -衍 -架 -擦 -倏 -璐 -瑁 -币 -楞 -胖 -夔 -趸 -邛 -惴 -饕 -虔 -蝎 -§ -哉 -贝 -宽 -辫 -炮 -扩 -饲 -籽 -魏 -菟 -锰 -伍 -猝 -末 -琳 -哚 -蛎 -邂 -呀 -姿 -鄞 -却 -歧 -仙 -恸 -椐 -森 -牒 -寤 -袒 -婆 -虢 -雅 -钉 -朵 -贼 -欲 -苞 -寰 -故 -龚 -坭 -嘘 -咫 -礼 -硷 -兀 -睢 -汶 -’ -铲 -烧 -绕 -诃 -浃 -钿 -哺 -柜 -讼 -颊 -璁 -腔 -洽 -咐 -脲 -簌 -筠 -镣 -玮 -鞠 -谁 -兼 -姆 -挥 -梯 -蝴 -谘 -漕 -刷 -躏 -宦 -弼 -b -垌 -劈 -麟 -莉 -揭 -笙 -渎 -仕 -嗤 -仓 -配 -怏 -抬 -错 -泯 -镊 -孰 -猿 -邪 -仍 -秋 -鼬 -壹 -歇 -吵 -炼 -< -尧 -射 -柬 -廷 -胧 -霾 -凳 -隋 -肚 -浮 -梦 -祥 -株 -堵 -退 -L -鹫 -跎 -凶 -毽 -荟 -炫 -栩 -玳 -甜 -沂 -鹿 -顽 -伯 -爹 -赔 -蛴 -徐 -匡 -欣 -狰 -缸 -雹 -蟆 -疤 -默 -沤 -啜 -痂 -衣 -禅 -w -i -h -辽 -葳 -黝 -钗 -停 -沽 -棒 -馨 -颌 -肉 -吴 -硫 -悯 -劾 -娈 -马 -啧 -吊 -悌 -镑 -峭 -帆 -瀣 -涉 -咸 -疸 -滋 -泣 -翦 -拙 -癸 -钥 -蜒 -+ -尾 -庄 -凝 -泉 -婢 -渴 -谊 -乞 -陆 -锉 -糊 -鸦 -淮 -I -B -N -晦 -弗 -乔 -庥 -葡 -尻 -席 -橡 -傣 -渣 -拿 -惩 -麋 -斛 -缃 -矮 -蛏 -岘 -鸽 -姐 -膏 -催 -奔 -镒 -喱 -蠡 -摧 -钯 -胤 -柠 -拐 -璋 -鸥 -卢 -荡 -倾 -^ -_ -珀 -逄 -萧 -塾 -掇 -贮 -笆 -聂 -圃 -冲 -嵬 -M -滔 -笕 -值 
-炙 -偶 -蜱 -搐 -梆 -汪 -蔬 -腑 -鸯 -蹇 -敞 -绯 -仨 -祯 -谆 -梧 -糗 -鑫 -啸 -豺 -囹 -猾 -巢 -柄 -瀛 -筑 -踌 -沭 -暗 -苁 -鱿 -蹉 -脂 -蘖 -牢 -热 -木 -吸 -溃 -宠 -序 -泞 -偿 -拜 -檩 -厚 -朐 -毗 -螳 -吞 -媚 -朽 -担 -蝗 -橘 -畴 -祈 -糟 -盱 -隼 -郜 -惜 -珠 -裨 -铵 -焙 -琚 -唯 -咚 -噪 -骊 -丫 -滢 -勤 -棉 -呸 -咣 -淀 -隔 -蕾 -窈 -饨 -挨 -煅 -短 -匙 -粕 -镜 -赣 -撕 -墩 -酬 -馁 -豌 -颐 -抗 -酣 -氓 -佑 -搁 -哭 -递 -耷 -涡 -桃 -贻 -碣 -截 -瘦 -昭 -镌 -蔓 -氚 -甲 -猕 -蕴 -蓬 -散 -拾 -纛 -狼 -猷 -铎 -埋 -旖 -矾 -讳 -囊 -糜 -迈 -粟 -蚂 -紧 -鲳 -瘢 -栽 -稼 -羊 -锄 -斟 -睁 -桥 -瓮 -蹙 -祉 -醺 -鼻 -昱 -剃 -跳 -篱 -跷 -蒜 -翎 -宅 -晖 -嗑 -壑 -峻 -癫 -屏 -狠 -陋 -袜 -途 -憎 -祀 -莹 -滟 -佶 -溥 -臣 -约 -盛 -峰 -磁 -慵 -婪 -拦 -莅 -朕 -鹦 -粲 -裤 -哎 -疡 -嫖 -琵 -窟 -堪 -谛 -嘉 -儡 -鳝 -斩 -郾 -驸 -酊 -妄 -胜 -贺 -徙 -傅 -噌 -钢 -栅 -庇 -恋 -匝 -巯 -邈 -尸 -锚 -粗 -佟 -蛟 -薹 -纵 -蚊 -郅 -绢 -锐 -苗 -俞 -篆 -淆 -膀 -鲜 -煎 -诶 -秽 -寻 -涮 -刺 -怀 -噶 -巨 -褰 -魅 -灶 -灌 -桉 -藕 -谜 -舸 -薄 -搀 -恽 -借 -牯 -痉 -渥 -愿 -亓 -耘 -杠 -柩 -锔 -蚶 -钣 -珈 -喘 -蹒 -幽 -赐 -稗 -晤 -莱 -泔 -扯 -肯 -菪 -裆 -腩 -豉 -疆 -骜 -腐 -倭 -珏 -唔 -粮 -亡 -润 -慰 -伽 -橄 -玄 -誉 -醐 -胆 -龊 -粼 -塬 -陇 -彼 -削 -嗣 -绾 -芽 -妗 -垭 -瘴 -爽 -薏 -寨 -龈 -泠 -弹 -赢 -漪 -猫 -嘧 -涂 -恤 -圭 -茧 -烽 -屑 -痕 -巾 -赖 -荸 -凰 -腮 -畈 -亵 -蹲 -偃 -苇 -澜 -艮 -换 -骺 -烘 -苕 -梓 -颉 -肇 -哗 -悄 -氤 -涠 -葬 -屠 -鹭 -植 -竺 -佯 -诣 -鲇 -瘀 -鲅 -邦 -移 -滁 -冯 -耕 -癔 -戌 -茬 -沁 -巩 -悠 -湘 -洪 -痹 -锟 -循 -谋 -腕 -鳃 -钠 -捞 -焉 -迎 -碱 -伫 -急 -榷 -奈 -邝 -卯 -辄 -皲 -卟 -醛 -畹 -忧 -稳 -雄 -昼 -缩 -阈 -睑 -扌 -耗 -曦 -涅 -捏 -瞧 -邕 -淖 -漉 -铝 -耦 -禹 -湛 -喽 -莼 -琅 -诸 -苎 -纂 -硅 -始 -嗨 -傥 -燃 -臂 -赅 -嘈 -呆 -贵 -屹 -壮 -肋 -亍 -蚀 -卅 -豹 -腆 -邬 -迭 -浊 -} -童 -螂 -捐 -圩 -勐 -触 -寞 -汊 -壤 -荫 -膺 -渌 -芳 -懿 -遴 -螈 -泰 -蓼 -蛤 -茜 -舅 -枫 -朔 -膝 -眙 -避 -梅 -判 -鹜 -璜 -牍 -缅 -垫 -藻 -黔 -侥 -惚 -懂 -踩 -腰 -腈 -札 -丞 -唾 -慈 -顿 -摹 -荻 -琬 -~ -斧 -沈 -滂 -胁 -胀 -幄 -莜 -Z -匀 -鄄 -掌 -绰 -茎 -焚 -赋 -萱 -谑 -汁 -铒 -瞎 -夺 -蜗 -野 -娆 -冀 -弯 -篁 -懵 -灞 -隽 -芡 -脘 -俐 -辩 -芯 -掺 -喏 -膈 -蝈 -觐 -悚 -踹 -蔗 -熠 -鼠 -呵 -抓 -橼 -峨 -畜 -缔 -禾 -崭 -弃 -熊 -摒 -凸 -拗 -穹 -蒙 -抒 -祛 -劝 -闫 -扳 -阵 -醌 -踪 -喵 -侣 -搬 -仅 -荧 -赎 -蝾 -琦 -买 -婧 -瞄 -寓 -皎 -冻 -赝 -箩 -莫 -瞰 -郊 -笫 -姝 -筒 -枪 -遣 -煸 -袋 -舆 -痱 -涛 -母 -〇 -启 -践 -耙 -绲 -盘 -遂 -昊 -搞 -槿 -诬 -纰 -泓 -惨 -檬 -亻 -越 -C -o -憩 -熵 -祷 -钒 -暧 -塔 -阗 -胰 -咄 -娶 -魔 -琶 -钞 -邻 -扬 -杉 -殴 -咽 -弓 -〆 -髻 -】 -吭 -揽 -霆 -拄 -殖 -脆 -彻 -岩 -芝 -勃 -辣 -剌 -钝 -嘎 -甄 -佘 -皖 -伦 -授 -徕 -憔 -挪 -皇 -庞 -稔 -芜 -踏 -溴 -兖 -卒 -擢 -饥 -鳞 -煲 -‰ -账 -颗 -叻 -斯 -捧 -鳍 -琮 -讹 -蛙 -纽 -谭 -酸 -兔 -莒 -睇 -伟 -觑 -羲 -嗜 -宜 -褐 -旎 -辛 -卦 -诘 -筋 -鎏 -溪 -挛 -熔 -阜 -晰 -鳅 -丢 -奚 -灸 -呱 -献 -陉 -黛 -鸪 -甾 -萨 -疮 -拯 -洲 -疹 -辑 -叙 -恻 -谒 -允 -柔 -烂 -氏 -逅 -漆 -拎 -惋 -扈 -湟 -纭 -啕 -掬 -擞 -哥 -忽 -涤 -鸵 -靡 -郗 -瓷 -扁 -廊 -怨 -雏 -钮 -敦 -E -懦 -憋 -汀 -拚 -啉 -腌 -岸 -f -痼 -瞅 -尊 -咀 -眩 -飙 -忌 -仝 -迦 -熬 -毫 -胯 -篑 -茄 -腺 -凄 -舛 -碴 -锵 -诧 -羯 -後 -漏 -汤 -宓 -仞 -蚁 -壶 -谰 -皑 -铄 -棰 -罔 -辅 -晶 -苦 -牟 -闽 -\ -烃 -饮 -聿 -丙 -蛳 -朱 -煤 -涔 -鳖 -犁 -罐 -荼 -砒 -淦 -妤 -黏 -戎 -孑 -婕 -瑾 -戢 -钵 -枣 -捋 -砥 -衩 -狙 -桠 -稣 -阎 -肃 -梏 -诫 -孪 -昶 -婊 -衫 -嗔 -侃 -塞 -蜃 -樵 -峒 -貌 -屿 -欺 -缫 -阐 -栖 -诟 -珞 -荭 -吝 -萍 -嗽 -恂 -啻 -蜴 -磬 -峋 -俸 -豫 -谎 -徊 -镍 -韬 -魇 -晴 -U -囟 -猜 -蛮 -坐 -囿 -伴 -亭 -肝 -佗 -蝠 -妃 -胞 -滩 -榴 -氖 -垩 -苋 -砣 -扪 -馏 -姓 -轩 -厉 -夥 -侈 -禀 -垒 -岑 -赏 -钛 -辐 -痔 -披 -纸 -碳 -“ -坞 -蠓 -挤 -荥 -沅 -悔 -铧 -帼 -蒌 -蝇 -a -p -y -n -g -哀 -浆 -瑶 -凿 -桶 -馈 -皮 -奴 -苜 -佤 -伶 -晗 -铱 -炬 -优 -弊 -氢 -恃 -甫 -攥 -端 -锌 -灰 -稹 -炝 -曙 -邋 -亥 -眶 -碾 -拉 -萝 -绔 -捷 -浍 -腋 -姑 -菖 -凌 -涞 -麽 -锢 -桨 -潢 -绎 -镰 -殆 -锑 -渝 -铬 -困 -绽 -觎 -匈 -糙 -暑 -裹 -鸟 -盔 -肽 -迷 -綦 -『 -亳 -佝 -俘 -钴 -觇 -骥 -仆 -疝 -跪 -婶 -郯 -瀹 -唉 -脖 -踞 -针 -晾 -忒 -扼 -瞩 -叛 -椒 -疟 -嗡 -邗 -肆 -跆 -玫 -忡 -捣 -咧 -唆 -艄 -蘑 -潦 -笛 -阚 -沸 -泻 -掊 -菽 -贫 -斥 -髂 -孢 -镂 -赂 -麝 -鸾 -屡 -衬 -苷 -恪 -叠 -希 -粤 -爻 -喝 -茫 -惬 -郸 -绻 -庸 -撅 -碟 -宄 -妹 -膛 -叮 -饵 -崛 -嗲 -椅 -冤 -搅 -咕 -敛 -尹 -垦 -闷 -蝉 -霎 -勰 -败 -蓑 -泸 -肤 -鹌 -幌 -焦 -浠 -鞍 -刁 -舰 -乙 -竿 -裔 -。 -茵 -函 -伊 -兄 -丨 -娜 -匍 -謇 -莪 -宥 -似 -蝽 -翳 -酪 -翠 -粑 -薇 -祢 -骏 -赠 -叫 -Q -噤 -噻 -竖 -芗 -莠 -潭 -俊 -羿 -耜 -O -郫 -趁 -嗪 -囚 -蹶 -芒 -洁 -笋 -鹑 -敲 -硝 -啶 -堡 -渲 -揩 -』 -携 -宿 -遒 -颍 -扭 -棱 -割 -萜 -蔸 -葵 -琴 -捂 -饰 -衙 -耿 -掠 -募 -岂 -窖 -涟 -蔺 -瘤 -柞 -瞪 -怜 -匹 -距 -楔 -炜 -哆 -秦 -缎 -幼 -茁 -绪 -痨 -恨 -楸 -娅 -瓦 -桩 -雪 -嬴 -伏 -榔 -妥 -铿 -拌 -眠 -雍 -缇 -‘ -卓 -搓 -哌 -觞 -噩 -屈 -哧 -髓 -咦 -巅 -娑 -侑 -淫 -膳 -祝 -勾 -姊 -莴 
-胄 -疃 -薛 -蜷 -胛 -巷 -芙 -芋 -熙 -闰 -勿 -窃 -狱 -剩 -钏 -幢 -陟 -铛 -慧 -靴 -耍 -k -浙 -浇 -飨 -惟 -绗 -祜 -澈 -啼 -咪 -磷 -摞 -诅 -郦 -抹 -跃 -壬 -吕 -肖 -琏 -颤 -尴 -剡 -抠 -凋 -赚 -泊 -津 -宕 -殷 -倔 -氲 -漫 -邺 -涎 -怠 -$ -垮 -荬 -遵 -俏 -叹 -噢 -饽 -蜘 -孙 -筵 -疼 -鞭 -羧 -牦 -箭 -潴 -c -眸 -祭 -髯 -啖 -坳 -愁 -芩 -驮 -倡 -巽 -穰 -沃 -胚 -怒 -凤 -槛 -剂 -趵 -嫁 -v -邢 -灯 -鄢 -桐 -睽 -檗 -锯 -槟 -婷 -嵋 -圻 -诗 -蕈 -颠 -遭 -痢 -芸 -怯 -馥 -竭 -锗 -徜 -恭 -遍 -籁 -剑 -嘱 -苡 -龄 -僧 -桑 -潸 -弘 -澶 -楹 -悲 -讫 -愤 -腥 -悸 -谍 -椹 -呢 -桓 -葭 -攫 -阀 -翰 -躲 -敖 -柑 -郎 -笨 -橇 -呃 -魁 -燎 -脓 -葩 -磋 -垛 -玺 -狮 -沓 -砜 -蕊 -锺 -罹 -蕉 -翱 -虐 -闾 -巫 -旦 -茱 -嬷 -枯 -鹏 -贡 -芹 -汛 -矫 -绁 -拣 -禺 -佃 -讣 -舫 -惯 -乳 -趋 -疲 -挽 -岚 -虾 -衾 -蠹 -蹂 -飓 -氦 -铖 -孩 -稞 -瑜 -壅 -掀 -勘 -妓 -畅 -髋 -W -庐 -牲 -蓿 -榕 -练 -垣 -唱 -邸 -菲 -昆 -婺 -穿 -绡 -麒 -蚱 -掂 -愚 -泷 -涪 -漳 -妩 -娉 -榄 -讷 -觅 -旧 -藤 -煮 -呛 -柳 -腓 -叭 -庵 -烷 -阡 -罂 -蜕 -擂 -猖 -咿 -媲 -脉 -【 -沏 -貅 -黠 -熏 -哲 -烁 -坦 -酵 -兜 -× -潇 -撒 -剽 -珩 -圹 -乾 -摸 -樟 -帽 -嗒 -襄 -魂 -轿 -憬 -锡 -〕 -喃 -皆 -咖 -隅 -脸 -残 -泮 -袂 -鹂 -珊 -囤 -捆 -咤 -误 -徨 -闹 -淙 -芊 -淋 -怆 -囗 -拨 -梳 -渤 -R -G -绨 -蚓 -婀 -幡 -狩 -麾 -谢 -唢 -裸 -旌 -伉 -纶 -裂 -驳 -砼 -咛 -澄 -樨 -蹈 -宙 -澍 -倍 -貔 -操 -勇 -蟠 -摈 -砧 -虬 -够 -缁 -悦 -藿 -撸 -艹 -摁 -淹 -豇 -虎 -榭 -ˉ -吱 -d -° -喧 -荀 -踱 -侮 -奋 -偕 -饷 -犍 -惮 -坑 -璎 -徘 -宛 -妆 -袈 -倩 -窦 -昂 -荏 -乖 -K -怅 -撰 -鳙 -牙 -袁 -酞 -X -痿 -琼 -闸 -雁 -趾 -荚 -虻 -涝 -《 -杏 -韭 -偈 -烤 -绫 -鞘 -卉 -症 -遢 -蓥 -诋 -杭 -荨 -匆 -竣 -簪 -辙 -敕 -虞 -丹 -缭 -咩 -黟 -m -淤 -瑕 -咂 -铉 -硼 -茨 -嶂 -痒 -畸 -敬 -涿 -粪 -窘 -熟 -叔 -嫔 -盾 -忱 -裘 -憾 -梵 -赡 -珙 -咯 -娘 -庙 -溯 -胺 -葱 -痪 -摊 -荷 -卞 -乒 -髦 -寐 -铭 -坩 -胗 -枷 -爆 -溟 -嚼 -羚 -砬 -轨 -惊 -挠 -罄 -竽 -菏 -氧 -浅 -楣 -盼 -枢 -炸 -阆 -杯 -谏 -噬 -淇 -渺 -俪 -秆 -墓 -泪 -跻 -砌 -痰 -垡 -渡 -耽 -釜 -讶 -鳎 -煞 -呗 -韶 -舶 -绷 -鹳 -缜 -旷 -铊 -皱 -龌 -檀 -霖 -奄 -槐 -艳 -蝶 -旋 -哝 -赶 -骞 -蚧 -腊 -盈 -丁 -` -蜚 -矸 -蝙 -睨 -嚓 -僻 -鬼 -醴 -夜 -彝 -磊 -笔 -拔 -栀 -糕 -厦 -邰 -纫 -逭 -纤 -眦 -膊 -馍 -躇 -烯 -蘼 -冬 -诤 -暄 -骶 -哑 -瘠 -」 -臊 -丕 -愈 -咱 -螺 -擅 -跋 -搏 -硪 -谄 -笠 -淡 -嘿 -骅 -谧 -鼎 -皋 -姚 -歼 -蠢 -驼 -耳 -胬 -挝 -涯 -狗 -蒽 -孓 -犷 -凉 -芦 -箴 -铤 -孤 -嘛 -坤 -V -茴 -朦 -挞 -尖 -橙 -诞 -搴 -碇 -洵 -浚 -帚 -蜍 -漯 -柘 -嚎 -讽 -芭 -荤 -咻 -祠 -秉 -跖 -埃 -吓 -糯 -眷 -馒 -惹 -娼 -鲑 -嫩 -讴 -轮 -瞥 -靶 -褚 -乏 -缤 -宋 -帧 -删 -驱 -碎 -扑 -俩 -俄 -偏 -涣 -竹 -噱 -皙 -佰 -渚 -唧 -斡 -# -镉 -刀 -崎 -筐 -佣 -夭 -贰 -肴 -峙 -哔 -艿 -匐 -牺 -镛 -缘 -仡 -嫡 -劣 -枸 -堀 -梨 -簿 -鸭 -蒸 -亦 -稽 -浴 -{ -衢 -束 -槲 -j -阁 -揍 -疥 -棋 -潋 -聪 -窜 -乓 -睛 -插 -冉 -阪 -苍 -搽 -「 -蟾 -螟 -幸 -仇 -樽 -撂 -慢 -跤 -幔 -俚 -淅 -覃 -觊 -溶 -妖 -帛 -侨 -曰 -妾 -泗 -· -: -瀘 -風 -Ë -( -) -∶ -紅 -紗 -瑭 -雲 -頭 -鶏 -財 -許 -• -¥ -樂 -焗 -麗 -— -; -滙 -東 -榮 -繪 -興 -… -門 -業 -π -楊 -國 -顧 -é -盤 -寳 -Λ -龍 -鳳 -島 -誌 -緣 -結 -銭 -萬 -勝 -祎 -璟 -優 -歡 -臨 -時 -購 -= -★ -藍 -昇 -鐵 -觀 -勅 -農 -聲 -畫 -兿 -術 -發 -劉 -記 -專 -耑 -園 -書 -壴 -種 -Ο -● -褀 -號 -銀 -匯 -敟 -锘 -葉 -橪 -廣 -進 -蒄 -鑽 -阝 -祙 -貢 -鍋 -豊 -夬 -喆 -團 -閣 -開 -燁 -賓 -館 -酡 -沔 -順 -+ -硚 -劵 -饸 -陽 -車 -湓 -復 -萊 -氣 -軒 -華 -堃 -迮 -纟 -戶 -馬 -學 -裡 -電 -嶽 -獨 -マ -シ -サ -ジ -燘 -袪 -環 -❤ -臺 -灣 -専 -賣 -孖 -聖 -攝 -線 -▪ -α -傢 -俬 -夢 -達 -莊 -喬 -貝 -薩 -劍 -羅 -壓 -棛 -饦 -尃 -璈 -囍 -醫 -G -I -A -# -N -鷄 -髙 -嬰 -啓 -約 -隹 -潔 -賴 -藝 -~ -寶 -籣 -麺 -  -嶺 -√ -義 -網 -峩 -長 -∧ -魚 -機 -構 -② -鳯 -偉 -L -B -㙟 -畵 -鴿 -' -詩 -溝 -嚞 -屌 -藔 -佧 -玥 -蘭 -織 -1 -3 -9 -0 -7 -點 -砭 -鴨 -鋪 -銘 -廳 -弍 -‧ -創 -湯 -坶 -℃ -卩 -骝 -& -烜 -荘 -當 -潤 -扞 -係 -懷 -碶 -钅 -蚨 -讠 -☆ -叢 -爲 -埗 -涫 -塗 -→ -楽 -現 -鯨 -愛 -瑪 -鈺 -忄 -悶 -藥 -飾 -樓 -視 -孬 -ㆍ -燚 -苪 -師 -① -丼 -锽 -│ -韓 -標 -è -兒 -閏 -匋 -張 -漢 -Ü -髪 -會 -閑 -檔 -習 -裝 -の -峯 -菘 -輝 -И -雞 -釣 -億 -浐 -K -O -R -8 -H -E -P -T -W -D -S -C -M -F -姌 -饹 -» -晞 -廰 -ä -嵯 -鷹 -負 -飲 -絲 -冚 -楗 -澤 -綫 -區 -❋ -← -質 -靑 -揚 -③ -滬 -統 -産 -協 -﹑ -乸 -畐 -經 -運 -際 -洺 -岽 -為 -粵 -諾 -崋 -豐 -碁 -ɔ -V -2 -6 -齋 -誠 -訂 -´ -勑 -雙 -陳 -無 -í -泩 -媄 -夌 -刂 -i -c -t -o -r -a -嘢 -耄 -燴 -暃 -壽 -媽 -靈 -抻 -體 -唻 -É -冮 -甹 -鎮 -錦 -ʌ -蜛 -蠄 -尓 -駕 -戀 -飬 -逹 -倫 -貴 -極 -Я -Й -寬 -磚 -嶪 -郎 -職 -| -間 -n -d -剎 -伈 -課 -飛 -橋 -瘊 -№ -譜 -骓 -圗 -滘 -縣 -粿 -咅 -養 -濤 -彳 -® -% -Ⅱ -啰 -㴪 -見 -矞 -薬 -糁 -邨 -鲮 -顔 -罱 -З -選 -話 -贏 -氪 -俵 -競 -瑩 -繡 -枱 -β -綉 -á -獅 -爾 -™ -麵 -戋 -淩 -徳 -個 -劇 -場 -務 -簡 -寵 -h -實 -膠 -轱 -圖 -築 -嘣 -樹 -㸃 -營 -耵 -孫 -饃 -鄺 -飯 -麯 -遠 -輸 -坫 -孃 -乚 
-閃 -鏢 -㎡ -題 -廠 -關 -↑ -爺 -將 -軍 -連 -篦 -覌 -參 -箸 -- -窠 -棽 -寕 -夀 -爰 -歐 -呙 -閥 -頡 -熱 -雎 -垟 -裟 -凬 -勁 -帑 -馕 -夆 -疌 -枼 -馮 -貨 -蒤 -樸 -彧 -旸 -靜 -龢 -暢 -㐱 -鳥 -珺 -鏡 -灡 -爭 -堷 -廚 -Ó -騰 -診 -┅ -蘇 -褔 -凱 -頂 -豕 -亞 -帥 -嘬 -⊥ -仺 -桖 -複 -饣 -絡 -穂 -顏 -棟 -納 -▏ -濟 -親 -設 -計 -攵 -埌 -烺 -ò -頤 -燦 -蓮 -撻 -節 -講 -濱 -濃 -娽 -洳 -朿 -燈 -鈴 -護 -膚 -铔 -過 -補 -Z -U -5 -4 -坋 -闿 -䖝 -餘 -缐 -铞 -貿 -铪 -桼 -趙 -鍊 -[ -㐂 -垚 -菓 -揸 -捲 -鐘 -滏 -𣇉 -爍 -輪 -燜 -鴻 -鮮 -動 -鹞 -鷗 -丄 -慶 -鉌 -翥 -飮 -腸 -⇋ -漁 -覺 -來 -熘 -昴 -翏 -鲱 -圧 -鄉 -萭 -頔 -爐 -嫚 -г -貭 -類 -聯 -幛 -輕 -訓 -鑒 -夋 -锨 -芃 -珣 -䝉 -扙 -嵐 -銷 -處 -ㄱ -語 -誘 -苝 -歸 -儀 -燒 -楿 -內 -粢 -葒 -奧 -麥 -礻 -滿 -蠔 -穵 -瞭 -態 -鱬 -榞 -硂 -鄭 -黃 -煙 -祐 -奓 -逺 -* -瑄 -獲 -聞 -薦 -讀 -這 -樣 -決 -問 -啟 -們 -執 -説 -轉 -單 -隨 -唘 -帶 -倉 -庫 -還 -贈 -尙 -皺 -■ -餅 -產 -○ -∈ -報 -狀 -楓 -賠 -琯 -嗮 -禮 -` -傳 -> -≤ -嗞 -Φ -≥ -換 -咭 -∣ -↓ -曬 -ε -応 -寫 -″ -終 -様 -純 -費 -療 -聨 -凍 -壐 -郵 -ü -黒 -∫ -製 -塊 -調 -軽 -確 -撃 -級 -馴 -Ⅲ -涇 -繹 -數 -碼 -證 -狒 -処 -劑 -< -晧 -賀 -衆 -] -櫥 -兩 -陰 -絶 -對 -鯉 -憶 -◎ -p -e -Y -蕒 -煖 -頓 -測 -試 -鼽 -僑 -碩 -妝 -帯 -≈ -鐡 -舖 -權 -喫 -倆 -ˋ -該 -悅 -ā -俫 -. -f -s -b -m -k -g -u -j -貼 -淨 -濕 -針 -適 -備 -l -/ -給 -謢 -強 -觸 -衛 -與 -⊙ -$ -緯 -變 -⑴ -⑵ -⑶ -㎏ -殺 -∩ -幚 -─ -價 -▲ -離 -ú -ó -飄 -烏 -関 -閟 -﹝ -﹞ -邏 -輯 -鍵 -驗 -訣 -導 -歷 -屆 -層 -▼ -儱 -錄 -熳 -ē -艦 -吋 -錶 -辧 -飼 -顯 -④ -禦 -販 -気 -対 -枰 -閩 -紀 -幹 -瞓 -貊 -淚 -△ -眞 -墊 -Ω -獻 -褲 -縫 -緑 -亜 -鉅 -餠 -{ -} -◆ -蘆 -薈 -█ -◇ -溫 -彈 -晳 -粧 -犸 -穩 -訊 -崬 -凖 -熥 -П -舊 -條 -紋 -圍 -Ⅳ -筆 -尷 -難 -雜 -錯 -綁 -識 -頰 -鎖 -艶 -□ -殁 -殼 -⑧ -├ -▕ -鵬 -ǐ -ō -ǒ -糝 -綱 -▎ -μ -盜 -饅 -醬 -籤 -蓋 -釀 -鹽 -據 -à -ɡ -辦 -◥ -彐 -┌ -婦 -獸 -鲩 -伱 -ī -蒟 -蒻 -齊 -袆 -腦 -寧 -凈 -妳 -煥 -詢 -偽 -謹 -啫 -鯽 -騷 -鱸 -損 -傷 -鎻 -髮 -買 -冏 -儥 -両 -﹢ -∞ -載 -喰 -z -羙 -悵 -燙 -曉 -員 -組 -徹 -艷 -痠 -鋼 -鼙 -縮 -細 -嚒 -爯 -≠ -維 -" -鱻 -壇 -厍 -帰 -浥 -犇 -薡 -軎 -² -應 -醜 -刪 -緻 -鶴 -賜 -噁 -軌 -尨 -镔 -鷺 -槗 -彌 -葚 -濛 -請 -溇 -緹 -賢 -訪 -獴 -瑅 -資 -縤 -陣 -蕟 -栢 -韻 -祼 -恁 -伢 -謝 -劃 -涑 -總 -衖 -踺 -砋 -凉 -籃 -駿 -苼 -瘋 -昽 -紡 -驊 -腎 -﹗ -響 -杋 -剛 -嚴 -禪 -歓 -槍 -傘 -檸 -檫 -炣 -勢 -鏜 -鎢 -銑 -尐 -減 -奪 -惡 -θ -僮 -婭 -臘 -ū -ì -殻 -鉄 -∑ -蛲 -焼 -緖 -續 -紹 -懮! 
-䰾 -䲁 -丌 -丏 -丟 -並 -乂 -乗 -乩 -乭 -乹 -亀 -亂 -亅 -亊 -亠 -亰 -亶 -亹 -仂 -仉 -仏 -仛 -仫 -仮 -仳 -仵 -仼 -伃 -伋 -伕 -伝 -伷 -伾 -佀 -佁 -佇 -佈 -佉 -佋 -佔 -併 -佹 -佺 -佾 -侁 -侅 -侊 -侖 -侘 -侚 -侞 -価 -侶 -侷 -侹 -俁 -俅 -俋 -俌 -俍 -俛 -俠 -俳 -俴 -俶 -俽 -倈 -倓 -倖 -倗 -倞 -倢 -倣 -値 -倧 -倮 -倻 -偁 -偊 -偍 -偓 -偪 -偲 -側 -偵 -偸 -傃 -傉 -傑 -傒 -傕 -傖 -傜 -傭 -債 -傾 -僅 -僉 -僊 -働 -僔 -僕 -僖 -僙 -僜 -僡 -僩 -僭 -僰 -僱 -僴 -儁 -儂 -儆 -儇 -儈 -儉 -儐 -儔 -儕 -儘 -儚 -儞 -償 -儦 -儫 -儲 -儷 -儺 -儻 -儼 -兌 -児 -兕 -兗 -兪 -冂 -円 -冇 -冊 -冑 -冖 -冧 -冨 -冪 -冫 -冴 -凃 -凜 -凞 -凪 -凵 -刄 -刎 -別 -刦 -刧 -刼 -則 -剋 -剏 -剝 -剣 -剮 -劄 -劊 -劌 -劔 -劬 -効 -劼 -勔 -勖 -勗 -勛 -勞 -勣 -勦 -勱 -勲 -勳 -勵 -勷 -勸 -勻 -匂 -匄 -匏 -匚 -匱 -匸 -卋 -卍 -卐 -卣 -卬 -卮 -卲 -卹 -卺 -卻 -卽 -厓 -厔 -厙 -厭 -厰 -厲 -厴 -厶 -叄 -収 -叕 -叡 -叵 -吔 -吥 -吳 -吶 -呂 -呉 -呎 -呾 -咁 -咑 -咗 -咘 -咟 -咥 -咲 -咼 -咾 -哂 -哏 -哐 -哖 -哱 -唃 -唄 -唫 -唭 -唵 -唸 -啁 -啍 -啚 -啞 -啣 -啯 -啱 -啲 -啷 -喈 -喚 -喢 -喦 -喪 -喲 -喼 -嗄 -嗆 -嗇 -嗊 -嗎 -嗚 -嗢 -嗩 -嗶 -嗹 -嘅 -嘆 -嘍 -嘏 -嘔 -嘗 -嘚 -嘜 -嘥 -嘩 -嘮 -嘯 -嘰 -嘸 -噍 -噏 -噓 -噝 -噠 -噥 -噦 -噯 -噰 -噲 -噴 -噸 -噹 -嚇 -嚈 -嚐 -嚕 -嚗 -嚙 -嚟 -嚤 -嚦 -嚧 -嚨 -嚩 -嚮 -嚳 -嚶 -嚿 -囀 -囂 -囃 -囉 -囑 -囒 -囓 -囝 -団 -囧 -囪 -囮 -囯 -囲 -図 -囶 -囷 -圂 -圄 -圉 -圏 -圓 -圪 -圯 -坌 -坖 -坣 -坬 -坮 -坵 -垈 -垍 -垕 -垞 -垯 -垰 -垵 -垻 -垿 -埅 -埇 -埈 -埏 -埒 -埜 -埡 -埤 -埧 -埨 -埪 -埮 -埴 -埵 -埻 -埼 -堅 -堈 -堉 -堊 -堍 -堖 -堝 -堦 -堮 -堯 -堺 -塀 -塅 -塆 -塋 -塏 -塙 -塜 -塡 -塢 -塤 -塨 -塩 -塭 -塰 -塱 -塲 -塵 -塹 -塽 -墀 -墎 -増 -墘 -墜 -墡 -墣 -墫 -墬 -墮 -墱 -墳 -墺 -墼 -墾 -壄 -壆 -壋 -壌 -壎 -壔 -壘 -壙 -壞 -壟 -壠 -壢 -壩 -壯 -壱 -壺 -変 -夊 -夠 -夤 -夾 -奀 -奐 -奣 -奩 -奫 -奭 -奮 -妀 -妁 -妏 -妑 -妠 -妧 -妭 -妸 -妺 -姀 -姁 -姃 -姈 -姉 -姍 -姦 -姪 -姫 -姮 -姵 -姶 -姸 -娋 -娍 -娎 -娖 -娛 -娫 -娳 -娸 -婁 -婑 -婯 -婻 -婼 -媃 -媊 -媐 -媓 -媖 -媗 -媜 -媞 -媧 -媭 -媯 -媺 -媼 -媿 -嫄 -嫈 -嫘 -嫪 -嫲 -嫳 -嫵 -嫺 -嫻 -嬅 -嬈 -嬋 -嬌 -嬛 -嬝 -嬡 -嬤 -嬨 -嬪 -嬬 -嬭 -嬸 -嬾 -嬿 -孀 -孆 -孋 -孌 -孮 -孻 -孿 -宍 -実 -宧 -宮 -寀 -寁 -寈 -寊 -寔 -寖 -寗 -寘 -寛 -寜 -寢 -審 -寯 -尋 -尗 -尢 -尪 -屄 -屇 -屍 -屓 -屚 -屜 -屢 -屬 -屭 -屺 -屻 -岀 -岈 -岡 -岣 -岧 -岪 -岬 -岰 -岵 -岻 -峅 -峇 -峍 -峘 -峚 -峠 -峴 -峼 -峽 -崁 -崈 -崍 -崐 -崑 -崒 -崗 -崘 -崙 -崚 -崞 -崟 -崠 -崢 -崱 -崵 -崶 -嵎 -嵒 -嵕 -嵖 -嵗 -嵙 -嵛 -嵜 -嵨 -嵮 -嵰 -嵴 -嵻 -嵿 -嶁 -嶃 -嶄 -嶇 -嶋 -嶌 -嶍 -嶒 -嶔 -嶗 -嶝 -嶠 -嶢 -嶦 -嶧 -嶬 -嶰 -嶲 -嶴 -嶷 -嶸 -嶼 -巂 -巄 -巆 -巋 -巌 -巎 -巑 -巒 -巔 -巖 -巘 -巛 -巰 -巶 -巻 -巿 -帔 -帙 -帡 -帢 -帳 -幀 -幃 -幗 -幟 -幣 -幪 -幫 -幵 -幷 -幾 -庀 -庁 -広 -庢 -庲 -庼 -廁 -廂 -廄 -廆 -廈 -廋 -廌 -廍 -廑 -廔 -廕 -廙 -廝 -廞 -廟 -廡 -廢 -廧 -廨 -廩 -廬 -廱 -廸 -廻 -廼 -弁 -弅 -弇 -弉 -弐 -弒 -弔 -弖 -弢 -弨 -弸 -弾 -彀 -彄 -彅 -彆 -彊 -彎 -彔 -彖 -彘 -彙 -彜 -彞 -彠 -彡 -彣 -彥 -彫 -彿 -徂 -徑 -從 -徠 -徧 -徫 -徬 -徭 -徴 -徸 -忉 -忝 -忞 -忬 -忯 -忳 -怍 -怙 -怛 -怵 -恆 -恊 -恥 -恵 -悆 -悛 -悝 -悞 -悧 -悪 -悰 -悳 -惇 -惔 -惣 -惱 -惲 -愃 -愆 -愍 -愐 -愒 -愔 -愜 -愨 -愭 -愴 -愷 -愼 -愾 -慄 -慘 -慚 -慜 -慟 -慣 -慥 -慮 -慳 -慾 -憂 -憊 -憍 -憐 -憑 -憓 -憕 -憙 -憚 -憤 -憫 -憲 -憺 -憻 -懃 -懇 -懌 -懍 -懐 -懣 -懮 -懲 -懶 -懸 -懺 -懼 -懽 -懾 -戇 -戔 -戕 -戙 -戡 -戥 -戦 -戩 -戰 -戱 -戲 -戸 -戻 -戽 -扆 -扥 -抃 -抇 -抦 -拋 -拏 -拝 -拡 -拺 -挙 -挵 -挹 -挻 -挾 -捒 -捜 -捦 -捨 -捩 -捫 -捭 -捱 -掃 -掄 -掙 -掛 -掞 -掟 -採 -掾 -揀 -揄 -揆 -揔 -揮 -揺 -搖 -搗 -搠 -搢 -搳 -搵 -搶 -搾 -摂 -摜 -摟 -摠 -摭 -摯 -摳 -摴 -摵 -摶 -摺 -摻 -摽 -撈 -撐 -撓 -撖 -撙 -撚 -撣 -撥 -撫 -撲 -撳 -撾 -撿 -擁 -擇 -擊 -擋 -擔 -擠 -擥 -擬 -擯 -擰 -擱 -擲 -擴 -擷 -擺 -擼 -擾 -攏 -攔 -攖 -攜 -攞 -攢 -攣 -攤 -攪 -攬 -攴 -攷 -攽 -敍 -敎 -敔 -敗 -敘 -敫 -敭 -敵 -敻 -敾 -斂 -斃 -斎 -斕 -斖 -斝 -斬 -斷 -斿 -旂 -旃 -旄 -旉 -旙 -旛 -旡 -旲 -旳 -旻 -旼 -旽 -旾 -旿 -昃 -昉 -昍 -昐 -昚 -昛 -昜 -昞 -昡 -昣 -昤 -昪 -昫 -昰 -昺 -晈 -晉 -晊 -晙 -晛 -晝 -晩 -晪 -晫 -晭 -晸 -暅 -暈 -暉 -暊 -暌 -暎 -暏 -暐 -暕 -暘 -暝 -暟 -暠 -暦 -暫 -暱 -暲 -暸 -暻 -暾 -曄 -曅 -曆 -曇 -曌 -曔 -曖 -曠 -曧 -曨 -曩 -曮 -曶 -曷 -曺 -曽 -朊 -朏 -朓 -朖 -朧 -朶 -杁 -杌 -杓 -杙 -杣 -杤 -杧 -杬 -杴 -杻 -杼 -枏 -枖 -枛 -枠 -枡 -枲 -枹 -柁 -柃 -柉 -柊 -柎 -柝 -柟 -柰 -柵 -柶 -柷 -査 -柾 -栃 -栄 -栐 -栒 -栜 -栝 -栞 -栨 -栲 -栴 -栻 -桄 -桕 -桙 -桜 -桝 -桫 -桱 -桲 -桴 -桿 -梀 -梂 -梃 -梉 -梔 -梘 -梟 -梠 -梣 -梫 -梱 -梶 -梽 -棄 -棆 -棐 -棓 -棖 -棗 -棡 -棧 -棨 -棩 -棪 -棫 -棲 -棶 -棹 -棻 -棼 -椆 -椇 -椏 -椙 -椥 -椪 -椲 -椵 -楙 -楡 -楢 -楤 -楧 -楨 -楫 -楮 -楯 -楳 -榊 -榍 -榎 -榑 -榖 -榗 -榘 -榢 -榣 -榤 -榦 -榲 -榿 -槀 -槁 -槃 -槊 -槓 -槔 -槙 -槤 -槩 -槭 -槰 -槱 -槳 -槺 -槻 -槼 -樀 -樁 -樅 -樆 -樋 -樑 -樗 -樘 -樞 -権 -樫 -樺 -樻 -橈 -橐 -橒 -橓 -橚 -橢 -橫 -橿 -檄 -檇 -檉 -檊 -檎 -檜 -檞 -檠 -檡 -檢 -檣 -檦 -檨 -檯 
-檳 -檵 -檻 -檽 -櫂 -櫃 -櫆 -櫈 -櫓 -櫚 -櫛 -櫞 -櫟 -櫨 -櫪 -櫱 -櫸 -櫻 -櫾 -櫿 -欄 -欉 -欏 -欒 -欖 -欞 -欥 -欸 -欹 -欽 -歊 -歎 -歛 -歩 -歲 -歳 -歴 -歿 -殂 -殄 -殑 -殘 -殛 -殞 -殟 -殤 -殭 -殮 -殯 -殲 -殳 -毀 -毆 -毉 -毌 -毎 -毐 -毖 -毘 -毬 -毴 -毸 -毿 -氂 -氈 -氍 -氫 -氬 -氷 -氹 -氻 -氾 -汎 -汜 -汧 -汭 -沄 -沆 -沇 -沍 -沒 -沖 -沘 -沚 -沜 -沢 -沨 -沯 -沺 -況 -泂 -泆 -泇 -泐 -泖 -泚 -洌 -洎 -洢 -洣 -洤 -洨 -洩 -洸 -洹 -浄 -浛 -浞 -浟 -浡 -浤 -浯 -浵 -浹 -涙 -涼 -淍 -淎 -淏 -淓 -淛 -淠 -淥 -淪 -淯 -淰 -淵 -淶 -淸 -淺 -淽 -渃 -済 -渉 -渋 -渕 -渙 -渟 -渦 -渫 -渼 -渽 -渾 -湉 -湊 -湔 -湜 -湞 -湣 -湥 -湧 -湳 -湴 -湼 -満 -溁 -溈 -溋 -溎 -準 -溙 -溦 -溲 -溵 -溼 -滀 -滄 -滅 -滈 -滉 -滌 -滎 -滝 -滯 -滲 -滷 -滸 -滹 -滻 -滽 -滾 -漇 -漈 -漎 -漚 -漣 -漬 -漲 -漴 -漵 -漷 -漸 -漼 -漿 -潁 -潑 -潛 -潟 -潯 -潰 -潲 -潽 -潾 -潿 -澀 -澁 -澂 -澆 -澇 -澉 -澋 -澌 -澔 -澗 -澠 -澣 -澥 -澪 -澮 -澯 -澱 -澻 -濁 -濊 -濋 -濘 -濙 -濫 -濬 -濰 -濲 -濶 -濺 -濼 -濾 -瀁 -瀅 -瀆 -瀉 -瀍 -瀏 -瀔 -瀕 -瀝 -瀞 -瀟 -瀠 -瀦 -瀧 -瀨 -瀬 -瀰 -瀲 -瀴 -瀶 -瀾 -灃 -灊 -灑 -灘 -灝 -灤 -灧 -灴 -災 -炁 -炆 -炘 -炟 -炤 -炱 -炲 -炷 -炻 -烉 -烋 -烒 -烔 -烝 -烱 -烴 -焃 -焄 -焌 -焓 -焜 -焞 -焴 -焻 -焿 -煇 -煉 -煐 -煒 -煔 -煕 -煚 -煠 -煩 -煬 -煳 -煵 -煶 -熅 -熇 -熈 -熒 -熖 -熗 -熜 -熤 -熯 -熲 -熺 -熼 -熾 -熿 -燄 -燉 -燊 -燏 -燐 -燔 -燝 -燫 -燬 -燭 -燹 -燻 -燼 -燾 -燿 -爀 -爌 -爔 -爚 -爛 -爝 -爿 -牁 -牂 -牆 -牕 -牖 -牘 -牝 -牠 -牻 -牼 -牽 -犂 -犎 -犖 -犛 -犢 -犧 -犨 -犰 -犴 -犽 -狎 -狓 -狛 -狟 -狦 -狨 -狳 -狶 -狷 -狹 -狻 -猁 -猄 -猇 -猊 -猙 -猞 -猢 -猨 -猳 -猶 -猺 -猻 -獁 -獃 -獄 -獇 -獎 -獏 -獢 -獣 -獬 -獮 -獯 -獰 -獵 -獷 -獺 -獼 -獾 -玀 -玆 -玎 -玏 -玓 -玕 -玗 -玘 -玙 -玠 -玡 -玢 -玧 -玨 -玭 -玶 -玹 -玾 -珅 -珌 -珎 -珖 -珝 -珡 -珤 -珦 -珧 -珪 -珮 -珵 -珹 -珽 -琁 -琄 -琇 -琍 -琎 -琡 -琤 -琱 -琹 -琺 -琿 -瑀 -瑂 -瑆 -瑈 -瑊 -瑋 -瑑 -瑒 -瑝 -瑠 -瑢 -瑣 -瑤 -瑥 -瑧 -瑨 -瑯 -瑱 -瑳 -瑴 -瑺 -璄 -璆 -璉 -璌 -璕 -璘 -璙 -璚 -璠 -璡 -璣 -璥 -璦 -璪 -璫 -璬 -璮 -璱 -璵 -璸 -璹 -璽 -璿 -瓈 -瓊 -瓌 -瓏 -瓑 -瓔 -瓖 -瓘 -瓚 -瓛 -瓞 -甂 -甌 -甍 -甑 -甕 -甡 -甦 -甪 -畀 -畇 -畊 -畋 -畎 -畑 -畝 -畠 -畢 -畧 -畬 -畯 -異 -畳 -畷 -疇 -疊 -疋 -疍 -疒 -疕 -痍 -痙 -痟 -痩 -痲 -痺 -瘍 -瘓 -瘜 -瘞 -瘡 -瘧 -瘰 -瘺 -癀 -癆 -癇 -癒 -癘 -癟 -癡 -癢 -癤 -癥 -癩 -癬 -癭 -癮 -癯 -癰 -癱 -癲 -発 -皐 -皚 -皛 -皝 -皞 -皰 -皷 -皸 -盃 -盋 -盌 -盞 -盡 -監 -盦 -盧 -盨 -盩 -盪 -盫 -盷 -盺 -眀 -県 -眛 -眜 -眥 -眵 -眾 -睜 -睞 -睥 -睪 -睭 -睺 -瞋 -瞞 -瞢 -瞫 -瞼 -瞽 -矇 -矍 -矚 -矧 -矯 -砢 -砩 -砫 -砮 -砯 -砲 -砳 -砵 -硃 -硇 -硏 -硐 -硓 -硜 -硤 -硨 -硭 -硯 -碕 -碡 -碪 -碭 -碸 -碻 -碽 -磔 -磘 -磙 -磜 -磡 -磪 -磯 -磱 -磲 -磵 -磻 -磾 -礄 -礎 -礐 -礑 -礒 -礙 -礠 -礦 -礪 -礫 -礬 -礮 -礱 -礽 -祂 -祆 -祇 -祋 -祏 -祓 -祕 -祧 -祹 -祿 -禃 -禇 -禍 -禎 -禑 -禓 -禔 -禕 -禘 -禛 -禟 -禠 -禤 -禨 -禩 -禰 -禱 -禵 -禼 -禿 -秈 -秠 -秳 -稅 -稈 -稉 -稑 -稘 -稙 -稜 -稟 -稱 -稲 -稺 -稾 -穀 -穈 -穉 -穌 -積 -穎 -穟 -穠 -穡 -穢 -穣 -穫 -窅 -窋 -窣 -窩 -窪 -窮 -窯 -窰 -窶 -窺 -竄 -竅 -竇 -竈 -竊 -竑 -竜 -竦 -竩 -竻 -笄 -笘 -笞 -笥 -笩 -笪 -笭 -笮 -笯 -笱 -笳 -笹 -筅 -筊 -筌 -筍 -筘 -筥 -筦 -筧 -筬 -筭 -筲 -筳 -筶 -筻 -箆 -箇 -箋 -箏 -箑 -箒 -箜 -範 -篊 -篋 -篌 -篔 -篠 -篤 -篥 -篩 -篭 -篯 -篳 -簀 -簃 -簉 -簍 -簑 -簕 -簗 -簞 -簠 -簫 -簷 -簹 -簺 -簽 -簾 -籀 -籌 -籐 -籙 -籛 -籜 -籝 -籟 -籠 -籥 -籪 -籬 -籮 -籲 -籾 -粄 -粍 -粦 -粩 -糀 -糌 -糎 -糞 -糢 -糧 -糬 -糰 -糴 -糶 -糸 -糹 -糺 -糾 -紂 -紆 -紇 -紈 -紉 -紐 -紑 -紓 -紕 -紘 -紙 -紛 -紜 -紝 -紞 -紮 -紱 -紲 -紳 -紵 -紺 -紿 -絃 -絆 -経 -絎 -絕 -絛 -絜 -絞 -絢 -絨 -絪 -絳 -絵 -絹 -絺 -綃 -綈 -綎 -綏 -綖 -継 -続 -綜 -綝 -綞 -綠 -綢 -綣 -綧 -綬 -綮 -綰 -綳 -綴 -綸 -綺 -綻 -綽 -綾 -綿 -緁 -緃 -緄 -緈 -緊 -緋 -総 -緒 -緘 -緜 -緝 -緞 -締 -緡 -緤 -編 -緩 -緬 -緱 -緲 -練 -縂 -縄 -縈 -縉 -縊 -縕 -縛 -縝 -縞 -縠 -縡 -縯 -縱 -縴 -縵 -縷 -縹 -縻 -績 -繃 -繆 -繇 -繒 -繕 -繖 -繙 -繚 -繞 -繩 -繫 -繭 -繰 -繳 -繻 -繼 -繽 -繾 -纁 -纈 -纍 -纏 -纓 -纔 -纕 -纖 -纘 -纜 -缶 -缽 -罃 -罅 -罈 -罉 -罌 -罍 -罟 -罨 -罰 -罳 -罵 -罶 -罷 -罽 -羂 -羆 -羈 -羋 -羕 -羗 -羣 -羥 -羨 -羱 -翀 -翂 -翃 -翕 -翙 -翜 -翬 -翮 -翹 -耎 -耔 -耨 -耬 -聃 -聒 -聟 -聰 -聱 -聳 -聴 -聶 -聽 -聾 -肅 -肏 -肜 -肫 -肸 -肹 -胂 -胅 -胇 -胊 -胙 -胝 -胼 -脅 -脇 -脈 -脛 -脣 -脩 -脫 -脬 -脭 -脳 -脷 -脹 -腧 -腫 -腳 -膂 -膣 -膥 -膩 -膮 -膽 -膾 -膿 -臉 -臍 -臏 -臚 -臞 -臟 -臠 -臯 -舂 -舉 -舎 -舘 -舢 -舥 -舨 -舩 -舲 -舺 -艅 -艉 -艋 -艎 -艏 -艔 -艙 -艚 -艱 -艸 -艽 -芑 -芛 -芨 -芴 -芻 -苅 -苤 -苧 -苳 -苺 -苻 -苾 -茀 -茇 -茈 -茘 -茚 -茛 -茝 -茮 -茲 -茷 -茺 -荅 -荇 -荊 -荎 -荖 -荳 -莕 -莖 -莙 -莛 -莢 -莧 -莩 -莿 -菈 -菉 -菍 -菑 -菔 -菝 -菥 -菫 -菰 -菴 -菶 -菸 -菹 -菺 -菼 -菾 -萇 -萐 -萠 -萡 -萣 -萩 -萵 -萹 -葃 -葊 -葎 -葙 -葜 -葝 -葦 -葯 -葰 -葶 -葷 -蒍 -蒎 -蒐 -蒓 -蒔 -蒗 -蒞 -蒢 -蒧 -蒨 -蒭 -蒯 -蒴 -蒹 -蒺 -蒼 -蒾 -蓀 -蓁 -蓂 -蓆 -蓍 -蓘 -蓚 -蓧 -蓨 -蓪 -蓭 -蓯 -蓳 -蓽 -蔆 -蔎 -蔔 -蔕 -蔘 -蔝 -蔞 -蔣 -蔥 -蔦 -蔭 
-蔴 -蔵 -蕁 -蕅 -蕎 -蕑 -蕖 -蕘 -蕚 -蕡 -蕢 -蕩 -蕪 -蕭 -蕷 -蕺 -蕻 -薀 -薆 -薊 -薌 -薐 -薑 -薔 -薗 -薘 -薙 -薜 -薞 -薟 -薨 -薫 -薲 -薷 -薸 -薺 -薾 -薿 -藎 -藟 -藦 -藨 -藪 -藶 -藸 -藹 -藺 -蘂 -蘄 -蘅 -蘊 -蘋 -蘐 -蘓 -蘗 -蘘 -蘚 -蘞 -蘢 -蘧 -蘩 -蘵 -蘶 -蘿 -虉 -虓 -虖 -虛 -虜 -虧 -虨 -虯 -虵 -虺 -蚆 -蚋 -蚍 -蚖 -蚡 -蚢 -蚵 -蚺 -蚼 -蛄 -蛉 -蛍 -蛑 -蛞 -蛯 -蛸 -蛺 -蛻 -蜆 -蜉 -蜑 -蜞 -蜢 -蜣 -蜨 -蜮 -蜯 -蜾 -蝀 -蝍 -蝓 -蝕 -蝘 -蝚 -蝟 -蝣 -蝤 -蝦 -蝨 -蝮 -蝯 -蝰 -蝲 -蝸 -螄 -螅 -螋 -螐 -螔 -螞 -螠 -螢 -螣 -螥 -螫 -螭 -螶 -螻 -螽 -螾 -蟄 -蟅 -蟊 -蟌 -蟎 -蟜 -蟥 -蟪 -蟫 -蟬 -蟯 -蟲 -蟳 -蟴 -蟶 -蟻 -蠂 -蠃 -蠅 -蠆 -蠊 -蠋 -蠍 -蠐 -蠑 -蠘 -蠙 -蠟 -蠣 -蠱 -蠲 -蠵 -蠶 -蠷 -蠻 -衂 -衎 -衕 -衚 -衜 -衝 -衞 -衽 -袓 -袛 -袞 -袴 -袾 -裊 -裎 -裒 -裖 -裬 -裵 -裾 -裿 -褌 -褍 -褎 -褘 -褙 -褞 -褧 -褫 -褭 -褸 -褻 -襌 -襖 -襞 -襠 -襤 -襦 -襪 -襯 -襲 -襴 -襶 -襻 -襾 -覇 -覈 -規 -覓 -覚 -覡 -覦 -覧 -覬 -覲 -観 -覽 -覿 -觔 -觙 -觚 -觜 -觭 -觱 -觴 -觶 -觿 -訁 -訃 -訇 -訌 -討 -訏 -訐 -訒 -訔 -訕 -訖 -託 -訛 -訝 -訟 -訥 -訴 -訶 -註 -証 -詁 -詆 -詈 -詐 -詒 -詔 -評 -詛 -詞 -詠 -詡 -詣 -詥 -詧 -詫 -詭 -詮 -詰 -詳 -詵 -詼 -誄 -誅 -誇 -認 -誒 -誕 -誡 -誣 -誤 -誥 -誦 -誨 -說 -読 -誰 -誴 -誹 -誼 -誾 -談 -諍 -諏 -諒 -論 -諗 -諜 -諟 -諠 -諡 -諤 -諦 -諧 -諪 -諫 -諭 -諮 -諱 -諲 -諳 -諴 -諶 -諷 -諸 -諺 -諼 -謀 -謁 -謂 -謄 -謊 -謌 -謎 -謏 -謐 -謔 -謖 -謗 -謙 -謚 -謜 -謠 -謤 -謨 -謩 -謫 -謬 -謳 -謾 -譏 -譓 -譔 -譙 -譚 -譞 -譫 -譭 -譯 -議 -譲 -譳 -譴 -譽 -譿 -讃 -讌 -讎 -讓 -讖 -讙 -讚 -讜 -讞 -谿 -豈 -豎 -豔 -豢 -豨 -豬 -豳 -豸 -豿 -貐 -貒 -貓 -貘 -貞 -貤 -貧 -貪 -貫 -責 -貮 -貯 -貲 -貳 -貶 -貸 -貺 -貽 -賁 -賂 -賃 -賄 -賈 -賊 -賑 -賒 -賔 -賕 -賚 -賞 -賡 -賤 -賦 -賨 -賬 -賭 -賹 -賺 -賻 -賽 -賾 -贄 -贅 -贇 -贊 -贌 -贍 -贓 -贔 -贖 -贛 -赧 -赬 -趐 -趕 -趖 -趨 -趺 -趼 -跅 -跏 -跗 -跡 -跣 -跩 -踎 -踐 -踰 -踴 -蹕 -蹟 -蹠 -蹤 -蹯 -蹺 -蹻 -躂 -躄 -躉 -躋 -躍 -躑 -躒 -躔 -躝 -躪 -躰 -軀 -軋 -軔 -軛 -軟 -転 -軫 -軲 -軸 -軹 -軺 -軻 -軼 -軾 -較 -輄 -輅 -輋 -輒 -輓 -輔 -輛 -輞 -輟 -輥 -輦 -輩 -輬 -輭 -輶 -輻 -輾 -輿 -轀 -轂 -轄 -轅 -轆 -轍 -轎 -轘 -轝 -轟 -轤 -辭 -辮 -辯 -辵 -辺 -辻 -込 -迴 -迵 -迺 -逈 -逋 -逌 -逎 -逕 -逖 -逤 -逨 -逴 -遄 -遊 -違 -遘 -遙 -遜 -遞 -遯 -遲 -遶 -遷 -遹 -遺 -遼 -邁 -邇 -邉 -邊 -邙 -邠 -邲 -邽 -邾 -郃 -郄 -郇 -郋 -郞 -郟 -郤 -郪 -郳 -郷 -郿 -鄃 -鄆 -鄋 -鄑 -鄒 -鄔 -鄖 -鄗 -鄘 -鄚 -鄜 -鄠 -鄤 -鄧 -鄩 -鄫 -鄰 -鄲 -鄳 -鄴 -酃 -酆 -酈 -酎 -酏 -酔 -酢 -酩 -酴 -酺 -酼 -醁 -醂 -醃 -醅 -醞 -醢 -醣 -醮 -醯 -醾 -醿 -釁 -釆 -釋 -釐 -釒 -釓 -釔 -釕 -釗 -釘 -釙 -釚 -釤 -釦 -釧 -釩 -釪 -釭 -釴 -釵 -釷 -釹 -釺 -鈀 -鈁 -鈄 -鈇 -鈈 -鈉 -鈊 -鈍 -鈏 -鈐 -鈑 -鈔 -鈕 -鈖 -鈞 -鈢 -鈣 -鈥 -鈦 -鈫 -鈮 -鈰 -鈳 -鈷 -鈸 -鈹 -鈾 -鈿 -鉀 -鉆 -鉈 -鉉 -鉋 -鉍 -鉏 -鉑 -鉓 -鉗 -鉚 -鉛 -鉞 -鉟 -鉤 -鉦 -鉬 -鉭 -鉲 -鉶 -鉷 -鉸 -鉻 -鉾 -鉿 -銂 -銃 -銅 -銋 -銍 -銓 -銕 -銖 -銚 -銜 -銠 -銣 -銥 -銦 -銨 -銩 -銪 -銫 -銬 -銱 -銲 -銳 -銶 -銹 -銻 -銼 -銾 -鋁 -鋅 -鋆 -鋇 -鋌 -鋏 -鋐 -鋒 -鋕 -鋗 -鋙 -鋡 -鋤 -鋥 -鋦 -鋨 -鋮 -鋯 -鋰 -鋱 -鋳 -鋶 -鋸 -鋹 -錀 -錏 -錐 -錒 -錕 -錘 -錚 -錞 -錟 -錠 -錡 -錢 -錨 -錫 -錬 -錮 -錳 -錸 -錻 -鍀 -鍇 -鍈 -鍉 -鍍 -鍏 -鍔 -鍘 -鍛 -鍝 -鍟 -鍠 -鍥 -鍩 -鍬 -鍱 -鍳 -鍶 -鍷 -鍺 -鍼 -鍾 -鎂 -鎅 -鎊 -鎌 -鎓 -鎔 -鎗 -鎘 -鎚 -鎛 -鎣 -鎦 -鎧 -鎪 -鎬 -鎭 -鎰 -鎳 -鎵 -鏃 -鏇 -鏈 -鏊 -鏌 -鏐 -鏑 -鏓 -鏗 -鏘 -鏝 -鏞 -鏟 -鏤 -鏦 -鏳 -鏴 -鏵 -鏷 -鏻 -鏽 -鐃 -鐇 -鐈 -鐓 -鐔 -鐙 -鐠 -鐤 -鐦 -鐧 -鐫 -鐬 -鐭 -鐮 -鐲 -鐳 -鐸 -鐺 -鐽 -鐿 -鑀 -鑁 -鑂 -鑄 -鑅 -鑊 -鑌 -鑑 -鑛 -鑠 -鑣 -鑨 -鑪 -鑭 -鑰 -鑲 -鑴 -鑷 -鑼 -鑾 -鑿 -閂 -閆 -閉 -閎 -閒 -閔 -閘 -閜 -閞 -閦 -閨 -閬 -閭 -閰 -閱 -閶 -閹 -閻 -閼 -閾 -閿 -闆 -闇 -闈 -闊 -闋 -闌 -闍 -闐 -闓 -闔 -闕 -闖 -闘 -闞 -闡 -闢 -闥 -阭 -阯 -陁 -陔 -陘 -陜 -陝 -陞 -陬 -陸 -険 -隄 -隈 -隊 -階 -隕 -隣 -險 -隰 -隱 -隲 -隳 -隴 -隷 -隸 -隻 -雋 -雑 -雖 -雛 -雝 -雩 -雫 -雱 -霅 -霈 -霊 -霑 -霙 -霤 -霧 -霨 -霶 -霽 -靁 -靂 -靄 -靉 -靚 -靫 -靬 -靭 -靺 -靼 -鞆 -鞏 -鞞 -鞥 -鞦 -鞨 -鞮 -鞴 -韁 -韃 -韆 -韋 -韌 -韑 -韙 -韜 -韞 -韠 -韡 -韮 -韺 -韾 -頁 -頃 -項 -須 -頊 -頌 -頍 -頎 -頏 -預 -頑 -頒 -頗 -領 -頜 -頠 -頦 -頫 -頴 -頵 -頷 -頸 -頹 -頻 -頼 -顆 -額 -顎 -顒 -顓 -顕 -顗 -願 -顙 -顛 -顥 -顫 -顰 -顱 -顳 -顴 -颮 -颯 -颱 -颶 -颺 -颼 -飆 -飈 -飠 -飡 -飢 -飥 -飩 -飪 -飫 -飭 -飴 -飽 -餃 -餄 -餉 -餌 -餎 -餒 -餓 -餗 -餚 -餛 -餞 -餡 -餵 -餺 -餾 -餿 -饋 -饌 -饑 -饒 -饗 -饞 -饟 -饢 -馘 -馛 -馦 -馭 -馯 -馱 -馳 -馼 -駁 -駄 -駅 -駆 -駐 -駑 -駒 -駔 -駘 -駙 -駛 -駝 -駟 -駢 -駭 -駰 -駱 -騁 -騂 -騄 -騅 -騋 -騎 -騏 -験 -騖 -騙 -騤 -騨 -騫 -騭 -騮 -騶 -騾 -驁 -驃 -驄 -驅 -驌 -驍 -驎 -驒 -驕 -驚 -驛 -驟 -驢 -驤 -驥 -驩 -驪 -骯 -髀 -髎 -髏 -髑 -髒 -髡 -髭 -髲 -髷 -髹 -鬄 -鬅 -鬆 -鬍 -鬚 -鬢 -鬥 -鬧 -鬨 -鬩 -鬪 -鬬 -鬮 -鬯 -鬱 -鬹 -鬻 -魃 -魈 -魋 -魍 -魎 -魕 -魘 -魛 -魞 -魟 -魣 -魨 -魩 -魮 -魯 -魴 -魷 -鮀 -鮁 -鮃 -鮄 -鮊 -鮋 -鮍 -鮐 -鮑 -鮒 -鮓 -鮗 -鮜 -鮟 -鮠 -鮡 -鮣 -鮨 -鮪 -鮫 -鮭 -鮰 -鮸 -鮹 -鮻 -鯀 -鯁 -鯃 -鯇 -鯊 
-鯏 -鯒 -鯓 -鯔 -鯕 -鯖 -鯗 -鯙 -鯛 -鯡 -鯢 -鯤 -鯧 -鯪 -鯭 -鯮 -鯰 -鯶 -鯷 -鯻 -鯿 -鰂 -鰃 -鰆 -鰈 -鰉 -鰍 -鰏 -鰒 -鰓 -鰕 -鰗 -鰛 -鰜 -鰟 -鰣 -鰤 -鰧 -鰨 -鰩 -鰭 -鰮 -鰱 -鰲 -鰳 -鰶 -鰷 -鰹 -鰺 -鰻 -鰼 -鰾 -鱀 -鱂 -鱅 -鱇 -鱈 -鱉 -鱊 -鱒 -鱓 -鱔 -鱖 -鱗 -鱘 -鱚 -鱝 -鱟 -鱠 -鱣 -鱥 -鱧 -鱨 -鱮 -鱰 -鱲 -鱵 -鱷 -鱺 -鳧 -鳩 -鳰 -鳴 -鳶 -鳽 -鴆 -鴇 -鴉 -鴒 -鴓 -鴕 -鴗 -鴛 -鴝 -鴞 -鴟 -鴡 -鴣 -鴦 -鴫 -鴯 -鴰 -鴴 -鵂 -鵄 -鵎 -鵐 -鵑 -鵒 -鵓 -鵙 -鵜 -鵝 -鵞 -鵟 -鵠 -鵡 -鵪 -鵯 -鵰 -鵲 -鵵 -鵼 -鵾 -鶆 -鶇 -鶉 -鶒 -鶓 -鶘 -鶚 -鶡 -鶥 -鶩 -鶬 -鶯 -鶲 -鶹 -鶺 -鶻 -鶼 -鶿 -鷂 -鷉 -鷎 -鷓 -鷙 -鷚 -鷟 -鷥 -鷦 -鷫 -鷯 -鷲 -鷳 -鷸 -鸊 -鸌 -鸐 -鸑 -鸕 -鸘 -鸚 -鸛 -鸜 -鸝 -鸞 -鹮 -鹵 -鹹 -鹼 -麅 -麇 -麈 -麊 -麐 -麞 -麩 -麪 -麴 -麹 -麼 -麿 -黁 -黇 -黌 -黐 -黙 -黥 -黧 -黨 -黴 -黶 -黻 -黼 -黽 -黿 -鼂 -鼇 -鼈 -鼉 -鼐 -鼒 -鼕 -鼢 -鼩 -鼯 -鼱 -鼴 -鼷 -齒 -齕 -齡 -齣 -齦 -齧 -齲 -齶 -龎 -龐 -龑 -龔 -龕 -龜 -龝 -龠 -ず -梌 -叀 -晢 -媸 -錾 -鐖 -䰡 -櫬 -锱 -υ -鼗 -媪 -澴 -苈 -眴 -𝜏 -缱 -𝜶 -조 -晡 -≡ -ࠀ -н -廇 -嗛 -篚 -ώ -莰 -윤 -纚 -𢢞 -闼 -熌 -饎 -蓊 -倅 -년 -聭 -耩 -≅ - -≺ -诌 - - - -耰 -菗 -僦 -⇣ -甊 -冓 -缷 -枊 -沕 -𝐴 -❹ -형 -秾 - -щ -厹 - -˗ -疔 -䩦 -髴 -⨂ -莏 -≧ -垆 -銌 -桤 -隤 -ギ -벽 -⑸ -✘ -̣ -辶 -铼 -게 -へ -獶 -藳 -祍 -黉 -跱 -⽬ -埙 -だ -蓣 -亯 -구 - -鹎 - -⾃ -楩 -⌘ -汏 -虒 -谖 - -﹜ -劖 -じ -瑇 -㮑 -揕 -⇔ -𤔲 -薉 -𝑾 -硗 -〈 -は -盍 -狽 -ж -я -挆 -槨 -γ -阏 -襕 -𝜉 -❖ -└ -총 -시 - -ν -刲 -ด -嬲 -绤 -𝐰 -飦 -扱 -帻 -辀 -廴 -к -蔖 -– -같 -熭 -巣 - -裛 -𝑶 -蓺 -蔊 -그 -匳 -玚 -Ц -璲 -련 -𨒅 -변 -㤵 -饫 -𨚵 -X -筇 -镡 -ⅳ -𝛿 -轸 -𝑭 -鋈 -鵩 -縁 -˙ -ɿ -𝒴 -㝮 -𝜂 -栠 -橦 -緇 -肰 - -跼 -䭜 -蜅 -訸 -㻶 -𝑉 -เ - -嚢 -鼔 -𝒆 -閫 -阃 -𥞹 -杪 -誊 - -鲋 -骍 -τ -莾 -凊 -﹡ -箚 -蛱 -樯 -喾 -幞 -欕 -搡 -戉 -瘖 -᙭ -砟 -ས -∤ -ี -メ -𝝁 -穑 -渶 -𦬁 -서 -⊗ -穇 -⌊ -を -鐻 -蘤 -≫ -◐ -汙 -蒒 -⑷ -蹨 -x -裥 -嶤 -ァ -従 -침 -稂 -𪧶 -で -𝑹 -⑫ -闩 -槫 -舮 -𝑿 -戁 -간 -戯 - -ོ -æ -わ -チ -砉 -Ψ -劂 -・ -В -鬭 -钔 -盭 -黓 -⎯ -𝐏 -함 -钪 -𝑸 -澰 -래 -藒 -龃 -瞀 -伧 -♂ -¹ -ƞ -澼 -餍 -倶 -ð - -嚱 -跬 -貙 -磿 -娬 -氿 -鹘 -𝐁 -摅 -ヱ -傰 -พ -湝 -ˆ -Л -翾 -≃ -에 -滫 -С -嫕 -あ -㈣ -ⅇ -垧 -⺮ -∠ -躐 -硌 -眢 -乧 -𝑐 -泃 -轫 -↔ -㎝ -≜ -⽇ -撟 -⟹ -脿 - -㸁 -靯 -う -⁠ -懬 -搷 -瀓 -ˁ -ⅲ -훈 - -お -𝛄 -瓅 -葻 -猋 -ら -⾳ -喣 -⽿ -č -鈎 -⑤ -å -阸 -름 -て -圮 -⚫ -⻄ -胨 -琠 -戄 -箄 -𝒳 -鼍 -й -⼲ -廪 -睃 -囫 -͞ -죄 - -호 -み -饩 - -⊆ -х -欚 -瘚 -≯ -瞗 -ž -嗵 -근 -ま - -⾔ -罥 -ʹ -鼃 -д -✳ -ゃ -悊 -𝐅 -영 -@ -ɣ -𝛷 -𝜁 -ǜ -犄 -⽂ -ཆ -胒 -﹦ -谫 -є -・ -𝐻 -狺 -백 -舳 -𝑁 -ษ - -𝜓 -𝒦 -盕 -유 -𪯐 -茑 -礤 -거 -コ -肂 -鸻 -ã -⑬ -铚 -걸 -磳 -綷 -𝒚 -舭 -腚 -㈩ -榱 -𝐌 -畾 -馐 -罾 -∕ -𝔛 -𝑬 -ç -楬 -櫽 -顼 -阋 - -꺼 -諛 -̌ -้ -㮀 -乵 -沬 -⼀ -ư -鲠 -䜩 -樉 -鹈 -搧 -轾 -䟒 -등 -𝝉 -잠 -짤 -า -蘨 -愪 -ྟ -慪 -鮝 -𝛑 -び -𥞪 -𝐾 -レ -교 -ྲ -달 -𝐩 - -殹 -踇 -狥 -ベ -미 -매 -⑭ -钁 -Θ -못 -𝜇 -侂 -ę -ฟ -邶 -諣 -颃 -𡢕 -昑 -𝒖 -讱 -﹤ -緵 -骢 -朢 -骘 -ℜ - -ゞ -愬 -鹬 - -ッ -ར -급 -‚ -鸶 -蒫 -餽 -蓃 -ข -辠 -ğ -氺 -暆 -笿 -迚 -甝 -ή -徼 -旣 -ϖ -ヲ -倕 -匽 -蓱 -리 -剷 -ู -逪 - -나 -堋 -焠 -Δ -炑 -爫 -蒖 -𝒓 -悫 -𝛱 - -𝐮 -騧 -ⅴ -饾 -贠 -𝚲 -崀 -磀 -柤 -肈 -⻮ -鶄 -狲 -跫 -지 -鳇 -痖 -跂 -秫 -ʒ -합 -ไ -迨 -𝜐 - -屦 -𝐶 -; -辎 -∵ -鴁 -撏 -ς -⟶ -薮 -㟪 -犮 -ب -ビ -藡 -甏 - -眡 -訿 -鉥 -媵 - -柫 -𝒞 -ь -萏 -ค -트 -訮 -汚 -眚 -〞 -き -ほ -刖 -髄 -蘀 -や -ة -诹 -т -ན -𝒃 -掼 -䓁 -僥 -팰 -枵 -✔ -³ -ེ -鼖 -屖 -鍮 -砇 -カ -舐 -牴 -𝜎 -㡿 -攉 -⽤ -晅 -労 -蛕 -𝐽 -Ʃ -く -穽 -孥 -𝒏 - - -ɬ -玦 -檮 -ョ -∥ -중 -萯 -呲 -䰈 -새 - -釶 - -ɢ -⊂ -臮 - -梼 -デ -骖 -ス -蹩 -羼 -▽ -Π -≪ -匛 -𝐼 -稊 -่ -茠 -䢉 -秝 -茐 -齎 -そ - -芕 -噚 -癉 -蹱 -蓜 -𝐬 -ϑ -е -瀋 -ϕ -χ -镟 -霂 -隒 -▱ -ヶ -撄 -둔 -¢ -こ -跲 -莻 -𝑠 -輮 -็ -堠 -푟 -赕 -◦ -ا -런 -帒 -汘 -̱ -尥 -蘠 -𦟜 -옥 -腠 -夨 -⩾ -𝑝 -歯 -刱 -여 -け -溘 -釰 -肍 -擗 -矱 -鍌 -芧 -술 -발 -鼫 -舾 -⼯ -𝝓 -ƒ -怸 -པ -𣐼 -疎 -铷 -Η -⑺ -蒏 -림 -⃛ -゜ -褴 -𨒪 -れ -揢 -さ -櫫 -櫑 -䋎 -灋 -櫜 -诓 -❶ -𝐃 -Q -袳 -ℒ -菂 - -荙 -ℛ -⁄ -堙 -贋 -̅ -鳏 -̂ -、 -茍 -泜 -𝑈 -즉 -噔 - - -迓 -Ⅸ -❷ -이 -_ -⾊ -Ö -铥 -耹 -䶮 - -무 - -セ -饳 -อ -篾 -통 -‒ -ย -덕 -말 -艨 -Ω -𝐨 -螓 -澐 -巠 -⋅ -钶 -도 -鸱 -齍 -恑 -褛 -剟 -준 -勶 -𠟠 -ß -箅 -𝑆 -悃 -蘥 - -Ξ -𝑘 -妣 -𝑖 -𝐑 -纡 -釿 -⺌ -ヴ -𝕀 -涻 -箙 -塚 - -⼠ -墈 -∷ -疴 -ク -ㄕ -𝒂 -蒪 -蓡 - -鷇 -瘏 -𣹳 -橰 -嵚 -帀 -주 -ド -盓 -爇 -φ -觋 -𝜑 -钍 -화 -표 -Ɛ -篰 -명 -週 -с -蓛 -裢 -穜 -㱃 -玊 -鲕 -蒕 -箪 -⑯ -苽 -矦 -偰 -盝 -佊 -僨 -駉 -𝑳 -머 -ª -絅 - -锒 - -苆 -ั -𝛻 -碹 -咺 -竝 -и -づ -강 -辁 -́ -铽 -纩 -齑 -𝝎 -어 -ユ -躡 -𝒄 -ซ -畛 -鸰 -ླ - -骉 -❸ -揲 -廃 -湋 -𝑲 - - -旤 - -蹷 -钌 -국 -豙 -鬳 - 
-ɛ -轳 -俜 -眄 -萮 -𝐡 -颵 -箓 -魑 -𝑅 -漍 -ℤ - -逡 -학 -浖 -ょ -¬ -怴 -𝛤 -怿 -祌 -纥 -𝒑 -⃑ -棅 -笵 -낭 -栦 -⑰ -บ -𝔽 -𝑇 -埝 -⽓ -孱 -埶 -匜 -鸼 - - -벌 -ル -锸 -斫 -妟 -뽀 -昬 -댁 -ʂ -暯 -夳 -ノ -堞 -懘 -榼 -鞫 -오 -𝑡 -偑 -戗 -∴ -伥 -끝 -𬌗 -稯 -岜 -Ε -犲 -𩓞 -연 -鹚 - -ག -诜 -嗍 -倥 -鳣 -庑 -屾 -雚 - -椄 -颏 -酤 -𝒋 -欛 - -း -려 -缋 -¾ -ゴ -籑 -笤 -鞛 -鏺 -蓒 -설 -緍 -⑩ -迀 -鼋 -ɮ -위 -锪 -∨ -滆 -€ -躅 -鋓 -柀 -䐶 -啎 -𝛵 -骃 -ć -갈 -卨 -い -𝑺 -鸲 -壻 -偯 -𝑞 -譖 -곤 -溍 - -噫 -순 - -𝑽 -ы -赑 -蓸 -鸮 -稃 -っ -詗 -으 -⨀ -屮 -俦 -伛 -畱 -늬 -𝑂 -朼 -沰 -겨 -з -骀 -鸩 -𝜈 -º -苊 -诎 -皤 - -하 -̀ -砑 -凷 -翄 -𝑛 -赪 -≮ -浗 -𝐍 -û -オ -ƹ -𝜅 -묘 -曛 -鳊 -𝛩 -癹 -磒 -ば -⑨ -礆 - -乼 - -∽ -褱 -藴 -縶 -觥 -に -식 -凫 - -佥 -槷 -阍 -䰍 -졸 -전 -葢 -㝸 -も -⻔ -遽 - -蹰 -𝛺 -裏 -། -를 - -ろ -짭 - -ぐ -싶 -渰 -⊤ -浳 -൯ -∃ -옛 -蟞 -과 -芠 -飖 -⼆ -敶 -粝 -𥃩 -坿 -䩉 -𝑯 -「 -矰 - -사 -𝛶 -𝑎 -挐 -푎 -동 -ℝ -Γ -︃ -珒 -鹍 -κ -鑓 -傁 -惓 -臿 -丣 -悒 -侔 -ñ -訳 -櫭 -賛 -觏 -辂 -覅 -濓 -堿 -擪 -฀ -𝑵 -扨 -嫫 -珰 - -寃 -𝒔 -曱 -髣 -인 -≌ -莵 -踳 -ⅱ -Ø -⌋ -¯ -挢 -̇ -﹪ -哕 -𦫳 - -襛 -昳 -铙 -铫 -軱 -汔 -ネ - -躩 -옷 -ถ -엄 -皊 -臑 -𧄝 -𝑃 - -䢅 -𝐝 -𝒍 -ℱ -𝐓 -蓾 -𝑻 -䋁 -裼 -개 -ത -𝒊 -僪 -瞂 -𦞠 -요 - ̄ -荍 -𝜔 -ф -峣 -庋 -檏 -袢 -绬 -Σ -향 -钫 -え -枅 -≝ -荦 -들 -勍 -ö -𝒕 -툰 -遬 -𝐵 -擧 -咢 -钘 - -𝒢 -Ⅷ -➢ -讧 -ω -簟 -廐 -刳 -阘 -б -⊘ -髟 -臓 -루 -⎧ -诳 -у -诮 -蠪 -梹 -耤 -パ -ن -∆ - -𝑫 -น -べ -坼 - -𝑤 -褽 -憼 -심 -∇ -迖 -휆 -叚 -없 -⼿ -钖 -斠 -䪵 -胠 -𝜋 -殽 -剜 -⾝ -− -慸 -𝛽 -椔 -⟩ -皦 -筚 -奰 -Å -물 -𝒐 -嫱 -钆 -ï -∪ -⇢ -ş - -㖞 -璗 -葸 -殢 -𝜺 -夲 -骒 -ち -회 -선 -睒 -轡 -ξ - -鲧 -镞 -碜 -놈 -Å -紴 - -⇤ -ྷ -⑪ -喟 -𦼰 - -蔩 -埦 -𝜆 -耋 -˜ -한 -舣 -馓 -⑻ - -ɐ -椘 -し -莐 -辔 -憰 -碛 -⁃ -飏 -颀 -跽 -⇥ -赀 -撺 -襜 -ɒ -袧 -л -정 -꾸 -콩 - -박 -缑 -柈 - -樲 -𝑮 -詘 -µ -𝑷 -鹪 -𝛼 -차 -讬 -掯 -硎 -𝑨 -舄 -‹ -누 -バ -ก -萀 -兇 -숙 -貍 - -踈 -친 -𝜽 -摰 -甿 -坜 -遑 -삼 -배 -Μ -을 -譊 -沩 -빈 -飑 -钹 -镨 -鐉 -宭 -桗 -ə -歺 -А -⇒ -锞 -𝒪 -棊 -愓 -莶 -琲 -འ -プ -་ -𝐿 -艟 -欬 -ิ -в -ų -纻 -㎎ -婄 -Ρ -歟 -椢 -粜 -종 -خ -ね -剞 -베 -斄 -幠 -ト -疛 -よ -╳ -醳 -군 -諂 -芰 -穋 -禆 -길 -秊 -噙 -y -锓 -⁵ - -拠 -Ĥ -𝑒 -窬 -抈 -︰ -퐶 -铳 -각 -ش -錉 -ù -臝 -闶 -𝒟 -芐 -韎 -권 -萚 - -ど -羮 -镕 -欔 -瘐 -받 -𝑚 -𢦟 -髤 -腙 -⽽ -상 -铘 -장 -𤇈 -ニ -凂 -ȷ -육 -а -살 -雠 -荑 -태 -穤 -ɯ - -圬 -楑 -단 -ง -⾯ -λ -⁰ -성 -萿 -缌 - -毣 -矅 - -푚 -˘ -貣 -∂ -은 -ė -䌛 -경 -せ - -拫 -⅞ -餕 -鐨 -翭 -ื -ɵ -⺍ -Փ -▬ -ว -희 -짐 -屙 -洫 -ေ -∏ -臜 - -剸 -芓 -운 -∓ -계 -祔 -鶵 -𝝅 -柂 -訢 -禊 -扽 -恫 -𝜙 -狢 -勠 -ི -𝜒 -จ -铯 -ྱ -𝑙 -蟇 -울 -莤 -牱 -𝒗 -詇 -靃 -殓 -栍 -踟 -ي - -鲄 -㓷 -贳 -ナ -鲓 -𝒙 -薁 -Χ -侪 -恌 -㰤 -목 -̄ -丱 -― -𝛔 -𝑔 - -鸷 -﹣ -籢 -脢 -δ -窭 -‐ -阒 -석 -아 -ォ -두 -𝐦 -浬 -搰 -褃 - -ལ -乇 -腘 -眊 -偬 -Ⅻ -ℳ -畤 -芟 -曈 -飧 -堌 -═ -谶 -櫝 -嬑 -冋 -嗌 -抜 - -腜 -공 -𝜕 -ん -鲭 -郐 -酓 -𝑍 -⾏ -⼹ -㐬 -고 -𝟑 -缯 -碤 -濩 -ʰ -佻 -Υ -∗ -賅 -집 -跹 - -ɾ -蔧 -다 -栫 -庰 -欤 -洿 -捾 -𝜍 -𝑄 - -攆 -夂 -檿 -荜 -ц -柖 -唅 -ท -ɦ -讦 -습 -锿 - -纆 -檑 -殰 -歠 -鼑 -Ä -و -☑ -緦 -悁 -偞 -ส -絭 -저 -踯 -騀 -쉰 -蒷 -揗 -儵 -ρ -薅 -ャ -‗ -犒 -旟 - -승 -ང -소 -𝛴 -瀜 -锜 -𣱼 -谳 - -軑 -ポ -楁 -𝑜 -袚 -ྐ -Á -𝑑 -鲀 -牾 -鬌 -푥 -¤ -呴 -‑ -✓ -민 -⼦ -ⅰ -⽉ -擿 -ч -➝ -가 -≳ -漥 -踖 -枧 -莝 -⻘ -傧 -𝑢 -ю -赍 -q -絫 -о -ア -ℐ -髫 -齢 -湎 -甓 -揿 - -ℋ -怹 -자 -⑦ -져 -椟 -鶟 -浕 -ー -𝛂 -偾 -⃗ -喑 -鹡 -≦ -磽 -ⅆ - -葂 -鶱 -ン -貇 -褡 -▴ -것 -喿 -つ -闚 - -盳 -𝟒 -雔 -洭 -殫 -楎 -£ -^ -葲 -𝟖 -眗 -棸 -潏 -熕 -𝟐 -품 -သ -樳 -⁴ -イ -㈢ -렴 -脰 -돈 -⑮ -钲 -𝒘 -訽 -爟 -幨 -枻 -亷 -猃 -σ -黩 -嘑 -榹 -⁡ -鍧 -𝑋 -枘 -𝑥 -원 -睚 -饔 -酲 - -顸 -람 -𝐫 -脁 -½ -긴 -ʔ -Ⅰ -旆 -죽 - -궐 - -奡 -㭃 -杝 -忾 -ม -掮 -饍 -摛 -쓰 -慊 -踣 -푅 -悽 -礅 -毄 -俓 -데 -冣 -만 -驖 -𤉣 -̃ -廾 -匵 -阇 -𤸫 -戣 -刌 -剕 -杅 -ο -蒥 -ː -癃 -蒬 -â -À -嗥 -우 -ケ -」 -聩 -ë -吽 -檌 -苰 -⑹ -Ÿ - -⑥ -노 - -˄ -鸫 -廛 -㱿 -鹛 -赟 -℅ -菿 -辳 -阼 -𝒇 -哋 -♀ -氕 -砤 -† -舡 -偝 -飜 -넓 -鈜 -ầ -닝 -禚 -匲 -〉 -Ф -锊 -ϵ -∙ -惛 -箧 -𝑦 -宬 -𝑀 -臙 -𩡶 - -¡ -潀 -수 -敃 -か -أ - -苌 -饘 -咝 -𝑼 -∘ -涷 -樍 -厣 -蝼 -墻 -Ñ -秅 -︒ -∅ -↵ -葹 -ỽ -𝑗 -た -일 -蒊 -치 -竢 -¨ -佢 -潵 -櫼 -軵 -𧕿 -倨 -歱 -瘅 -𝐭 -黾 -脼 -ê -땋 -鶷 -ё -鹯 -掲 -\ -𨳡 - -Г -ι -탁 -溞 -殪 -菭 -𝛥 -擛 -録 -㥥 -∀ -锇 -锃 -편 -餬 -瘻 -ཟ -豤 -로 -ɸ -ℎ -랑 -ʃ -鼹 -臬 -ŋ -巵 -譁 -w -窳 -蓔 -䉜 -浧 -酂 -⒀ -མ -椠 -槖 - -衄 -𨐨 -̿ -ご -⺗ -顇 -𝒫 -搕 -ミ -𪪋 -« -䣛 -鹩 -鴈 -п -는 -䋰 -𝛹 -犕 -呌 -𝒮 -𝑪 -鋎 -嚻 -杚 -䕊 -蠜 -ザ -𝐂 -☐ -𥘔 - -赜 -Ν -廦 
-瓾 -↦ -龉 -⽅ -棂 -𝜌 -큰 -踔 -ラ -。 -剤 -황 -⅜ -僈 -骈 -ɕ -О -м -椑 -𝑟 -纇 -𝑓 -𝐖 -Ш -⎦ -旹 -삶 -ึ -囵 - -す -ⅈ -ت -踽 -陴 -餱 -ป -막 -紟 -방 -剀 -簖 -闬 -キ -鋉 -납 -タ -谵 -詑 -족 -垔 -荋 -旰 -𥘸 -窾 - -신 -𝐎 -𝛃 - - -﹒ -縰 -猲 -郘 -파 -⊕ -镘 -𠊃 - -呔 -𝜗 -ʊ -𝛬 -辏 -陭 -𝑕 -庴 -ʐ -瀌 -倄 -蕞 -ل -絷 -러 -든 -བ -柅 -› -傚 -睠 -Ⅺ -饐 -蔮 -ɟ -莈 -𤨨 -⋯ -犭 -𝜃 -𥹳 -초 -⎞ -遌 -眇 -蓗 -綅 -토 -裰 - -⼼ -虘 -𝑌 -觯 -漶 -钤 -讒 -げ -螬 -鲹 -咷 -蓞 -僂 -𝒉 -が -桮 -포 -쟈 -柽 -ウ -綟 -缟 -䁾 -钭 -烕 -厠 - -孭 -礉 -­ -谲 - -⼤ -𝒒 -旒 -㫄 - - -鳓 -挈 -재 -ད -𧊒 -蝝 -𝐺 -懱 -芢 - -ⅼ -Ú -𝑱 -翯 -芶 -厽 -遉 -鲒 -η -𝛾 -趮 -虆 -汸 -嬖 -糈 -窸 - -추 -棬 -懔 -硁 -ぶ -抟 -胕 -𝑧 -⌦ -碫 -Ⅵ -속 -𝐚 - -Ç -행 -Ɵ -⑱ -贽 -箤 -р -敒 -衤 -풍 -⊛ -慉 -ψ -© -광 -ℕ -屣 -臌 -旵 -臁 -‡ -癎 -閡 -𡵂 -襐 -畟 - -萪 -娒 -瘼 -庳 -천 -觌 -Α -と -奁 -煃 -؟ -◯ -의 -攎 - -𝐞 -J -𢦒 -❑ -벤 -𝐒 -リ -蒉 -𝐱 -朹 -㈤ -„ -䗬 -Ι -ཀ -𡜵 -俣 -疬 - -墥 -吣 -У -榀 -絟 - -旓 -𝐛 -𝜷 -瑮 -≔ -笾 -ζ -김 -暵 -𝜹 -逶 -萙 -欇 -俧 -籴 -絰 -揶 -ǔ -宂 -伩 -Ө -菞 -梕 -エ -蹚 -제 -Š -沝 - -𦳢 -𝒱 -揠 -ℏ -𝐹 -箝 -규 -氒 -⼊ -鰌 -筮 -⼩ -대 -𝔾 -䄃 -𝐸 -﹕ -부 - -刓 -ȵ -缛 -기 -缊 -𝟎 -𨟻 -め -捃 -⽚ -鍪 -灆 -迻 -⾦ -荗 -v -𡒊 -汍 -斲 -姕 - -儴 -偒 -辤 -芀 -蝥 -ń -臥 -椀 -㪚 -š -담 -ø -䈰 -睏 -テ -﹐ - -참 -楘 -𝒌 -劓 -ɪ -醑 -绹 -諓 -𝛉 -ズ -怼 -埘 -臽 -잡 -镢 -𝜖 -진 -踬 -谠 -﹥ -髺 -腞 -현 -嘭 -ʑ -蓌 -〜 -锠 -蓶 -る - -∼ -枎 -緗 -薠 -芈 -耪 -𝒎 -謼 - -瘳 -诨 -忤 -œ -⇡ - -鲣 -ⅵ -Τ -㯮 - -㶲 -ⅹ -䙴 -坴 -馑 -缹 -椦 -ô -⼈ -フ -誏 -э -哙 -愎 - -埽 -祲 -마 -殍 -菋 -懑 - -辇 -鍤 -𝜀 -ɜ -り -𝐷 -㕞 -瑵 - -蔨 -Ⅶ -镴 -ภ -𝝀 -𢶡 -⃝ -ơ -柢 -𧴗 -ʁ -攙 - -な -𝑏 -挴 -餧 -絇 -怄 -曏 -洟 -軷 -∉ -咍 -⎪ -樛 -𝑣 -웃 -椊 -黢 -𝑩 -誩 -伓 -戠 -橥 -⟨ -豰 -懥 -涖 -綘 -詬 -ွ -˚ -刽 -ɑ -격 -稖 -𝟏 -禝 -墦 -𝑊 - -択 -檙 -∝ -颟 -诂 -𝐧 -踲 -𝜛 -𝑰 - -鲬 -⁸ -ǎ -문 - -紬 -楲 -䊭 -枨 -膑 -õ -던 -Ⅴ -秏 -馔 -拊 -缗 -隠 -牀 -淲 -鬰 -綵 -鶑 -詎 -慙 -劒 -閲 -鎡 -淒 -屨 -鉢 -扃 -鳬 -閤 -馿 -翛 -駸 -蛩 -驂 -嵲 -覩 -牋 -湲 -蓴 -賸 -遡 -翫 -嫋 -惻 -妬 -罇 -龏 -鵷 -閙 -鎸 -朅 -巉 -僞 -洶 -磧 -筯 -慇 -鷁 -斾 -斸 -酹 -幘 -羶 -閽 -薤 -泝 -覯 -怱 -缾 -氳 -躊 -檝 -擣 -虀 -誚 -甃 -艤 -櫳 -醖 -壚 -涴 -崦 -秪 -潄 -濆 -駡 -坰 -闉 -縑 -躕 -颻 -燠 -輙 -鏁 -嶮 -薖 -輈 -綆 -覷 -蹔 -攄 -鐩 -鑱 -羃 -轓 -麤 -驀 -欵 -亙 -朮 -邐 -箠 -怳 -鋩 -鷃 -篘 -蔌 -諄 -旐 -慍 -欷 -頽 -蜺 -覊 -禋 -秔 -蜩 -嚬 -櫺 -軿 -痾 -笻 -猱 -毳 -泬 -竚 -齪 -搘 -欻 -釂 -嚥 -誑 -籩 -韉 -幙 -舠 -飣 -颭 -颸 -歔 -屧 -巇 -艫 -壖 -牓 -薝 -銛 -皪 -枿 -剗 -歘 -鸂 -邅 -衒 -荄 -鴂 -嫗 -顦 -瀼 -繄 -搆 -狖 -貰 -醆 -秖 -蹀 -頳 -纒 -憇 -溽 -澦 -讐 -灩 -箎 -螿 -鰥 -飀 -澒 -矻 -枌 -擡 -鷖 -齬 -纊 -挼 -齟 -錙 -屩 -蠧 -氅 -漭 -躚 -翺 -瘵 -螘 -鵶 -㶁 -斵 -饜 -岏 -䍦 -哢 -彴 -豗 -靨 -鋋 -禳 -覘 -鞚 -擻 -涘 -溷 -沴 -嶓 -褊 -罏 -齏 -醲 -繅 -舴 -釃 -厖 -闤 -閴 -藂 -譍 -糲 -籞 -躞 -餳 -遰 -倐 -嘖 -鷀 -暍 -韝 -蘺 -齁 -醽 -醨 -憀 -䕷 -跕 -拶 -垤 -鸎 -漙 -躭 -傴 -蕕 -嘒 -晻 -糵 -閈 -嫠 -斁 -鶗 -詶 -囘 -羇 -橛 -鞬 -磈 -粃 -阨 -塿 -敧 -氊 -芼 -襆 -迍 -鬛 -憒 -釅 -蓐 -奬 -頲 -髠 -抔 -葅 -槧 -跧 -揜 -渇 -餔 -罝 -裯 -蹁 -椶 -幰 -逰 -饁 -棃 -噀 -轔 -囁 -惸 -騑 -呪 -鬐 -綌 -醤 -䆉 -艣 -鐶 -夐 -摐 -鸇 -睎 -羝 -紼 -鞿 -噉 -磑 -闗 -筩 -駮 -蹌 -縢 -闠 -鬙 -谹 -榾 -觳 -皁 -晼 -啗 -簣 -騕 -蹣 -麰 -觧 -怊 -朞 -鱍 -蟣 -畚 -蠁 -舁 -瞇 -劚 -鰐 -籯 -鬖 -柮 -飱 -帟 -酇 -崿 -霪 -緌 -踆 -欃 -縟 -搦 -琖 -搥 -倀 -觫 -遝 -嚅 -聵 -藋 -筴 -喁 -窻 -穏 -牥 -鎩 -礲 -膴 -鞵 -醵 -斚 -縲 -裀 -齷 -騃 -袠 -谺 -靦 -帬 -鯈 -曀 -灔 -崷 -趂 -徯 -闃 -洧 -獪 -稏 -煢 -歈 -呶 -壈 -襃 -旴 -檟 -簦 -谽 -箵 -謡 -慝 -餖 -稌 -朣 -禖 -嚀 -嵂 -黷 -甖 -洑 -釡 -蕝 -甆 -翣 -篸 -隮 -滃 -裌 -蔀 -籖 -秬 -鷴 -啅 -慼 -捄 -咮 -睟 -譎 -嘷 -駃 -檥 -蹐 -窊 -駞 -雘 -趯 -篲 -讋 -睍 -毰 -憗 -鳷 -嚲 -圞 -歃 -緼 -賫 -籋 -繐 -麏 -灕 -礧 -歜 -飇 -鵁 -疢 -賖 -窆 -螮 -毹 -硉 -耡 -甔 -篛 -娭 -髩 -燋 -輜 -籧 -撝 -嬙 -徃 -驦 -𡏖 -麕 -馹 -覔 -鶠 -褷 -綍 -螗 -嗈 -彯 -篨 -炰 -鄮 -噞 -尅 -鷰 -鋭 -饉 -霢 -䔩 -坱 -裓 -帨 -忺 -豅 -栱 -謦 -傯 -誵 -骭 -潩 -鬒 -嵫 -悮 -扊 -扅 -轢 -惝 -臈 -舃 -鞾 -譟 -袵 -眎 -簏 -埸 -堧 -憸 -雰 -腷 -嵓 -隩 -趄 -墐 -褦 -艑 -狴 -玿 -竪 -恧 -姱 -抆 -恓 -霣 -躓 -鞲 -晬 -簴 -唼 -曵 -褕 -罣 -縐 -衘 -巃 -攲 -輀 -貎 -哳 -醭 -鋣 -僛 -迕 -蠭 -膓 -欝 -洊 -敺 -纎 -栟 -鞓 -蛬 -灺 -痏 -恡 -踸 -霔 -濵 -昻 -鉘 -楖 -竛 -竮 -窱 -幬 -慤 -儗 -黮 -嘐 -睆 -頇 -麑 -壼 -㦸 -顋 -瘥 -苖 -韈 -盻 -袷 -矼 -塼 -鐍 -傞 -苶 -吷 -噇 -鶖 -僣 -髧 -䅉 -鯫 -襏 -縳 -蠨 -痡 -髽 -剉 -蝱 -鄽 -匼 -嚚 -襫 -缿 -鵊 -燖 -忸 -摝 -攅 -牷 -氎 -騣 -颿 -虡 -腯 -漘 -矓 -祫 -顢 -綀 -弮 -柙 -蔾 -胾 -筤 -馽 -砆 -冩 -賙 -唶 -麛 -簜 -蹏 -屼 -鞶 -煑 -踠 -愀 -蠒 -頬 -韲 -戞 -畆 -笊 -搨 -捽 -絙 -覉 -澨 -趫 -矹 -穮 -愠 -劘 -轣 -卭 -鼪 -杕 -轗 -擐 -蚿 -恚 -檛 -𩕳 -靆 -轕 -餼 -頮 
-槹 -蔉 -皜 -扄 -鮆 -轑 -蹡 -嵽 -甎 -蟈 -橅 -笴 -膰 -蕣 -澘 -髿 -樕 -褵 -蜋 -窼 -櫧 -雊 -胷 -嘵 -麄 -裋 -繢 -啐 -臛 -簁 -巓 -羜 -攧 -簮 -壊 -齩 -晹 -臲 -鬵 -齠 -媮 -幮 -壍 -蠛 -槜 -羖 -窓 -隃 -嚘 -輳 -籹 -凴 -崕 -獍 -嗸 -趦 -囅 -戺 -涬 -諉 -箯 -輊 -桹 -嵷 -㲲 -愊 -蒱 -洚 -赩 -輴 -幈 -齔 -嗁 -阽 -圠 -荈 -碔 -揎 -巀 -洏 -卼 -𨁝 -痁 -黳 -嗾 -䆗 -戃 -蕆 -頋 -悤 -掎 -㯝 -吚 -猘 -鮎 -鬴 -墁 -飋 -呿 -窀 -沲 -枒 -窌 -爼 -頞 -譡 -鶋 -湩 -㦬 -僾 -斒 -醼 -鶂 -磥 -揫 -犗 -齗 -鄶 -囏 -崪 -爞 -籓 -斮 -觝 -嵸 -驔 -䨴 -頺 -萑 -珓 -牸 -闒 -凘 -悢 -蟭 -濈 -嬄 -翽 -旍 -鶢 -罋 -輠 -怩 -頖 -趍 -壝 -嫮 -蕋 -踦 -轇 -眘 -巗 -嶭 -糓 -甽 -籺 -矟 -佖 -絏 -憮 -懡 -駈 -擕 -淟 -皡 -膋 -潨 -鳲 -趠 -麨 -頩 -漻 -輗 -墄 -賮 -㴩 -莟 -縦 -岝 -醻 -曚 -䙰 -噭 -醥 -筰 -躧 -踘 -鑕 -咈 -僶 -鶊 -鬂 -聼 -騐 -壒 -磎 -歗 -淈 -隟 -狃 -縋 -媻 -趲 -惙 -呫 -聮 -羾 -尫 -佽 -髼 -繋 -鬘 -旜 -疐 -阬 -䰐 -塈 -徤 -祊 -灂 -祅 -樷 -颾 -凟 -頀 -蠏 -塒 -衹 -婬 -裩 -粞 -憯 -匭 -筈 -盬 -霮 -黕 -靮 -伻 -緺 -瘝 -羑 -醸 -樝 -僎 -絓 -噆 -愞 -痗 -愽 -岊 -黤 -湑 -搉 -綯 -焮 -疉 -楛 -玼 -喤 -粔 -飂 -贐 -緉 -覰 -胔 -鞳 -摑 -墢 -斅 -誶 -僝 -鹺 -諌 -齅 -嵼 -讟 -冦 -脝 -婣 -緐 -茰 -飶 -欎 -慁 -抝 -瘉 -𡎺 -鈯 -瘃 -麫 -匊 -窞 -羓 -氄 -嚌 -姤 -橑 -駬 -冺 -騠 -㕙 -楶 -靸 -圎 -尀 -䙀 -鄏 -軃 -竁 -㹠 -刜 -剨 -罛 -鏹 -鬉 -簨 -藭 -藷 -僇 -瀫 -袨 -忮 -冡 -齯 -楪 -囋 -蟉 -醱 -尠 -牣 -攟 -袿 -齾 -甞 -啑 -潚 -樐 -絖 -酖 -觖 -骹 -嶅 -玃 -嫜 -廹 -儤 -矂 -艓 -挱 -骳 -嵳 -洴 -礓 -厪 -﨑 -禜 -籊 -瓻 -彛 -狁 -腪 -骾 -娯 -罻 -璅 -簳 -姢 -猰 -眹 -䴥 -堘 -搯 -怤 -緫 -聫 -涊 -熛 -輤 -䡾 -譌 -髇 -攛 -稭 -媕 -鬷 -跰 -縚 -鉧 -踧 -嚭 -襮 -藞 -滮 -颷 -荂 -蓰 -怫 -閧 -臕 -稛 -怗 -闑 -抶 -薶 -嶕 -瀺 -𥫗 -墝 -埆 -皥 -惷 -鞔 -鞺 -蟛 -瀡 -鎁 -酧 -恝 -齓 -嚄 -簔 -蟺 -㔶 -胹 -憖 -惄 -鸒 -貛 -軏 -縗 -蓻 -娵 -抺 -鼛 -虩 -歕 -矑 -繂 -襚 -倂 -廥 -諝 -虗 -弜 -兟 -繿 -偘 -翶 -肻 -棙 -斆 -碨 -醎 -蟢 -銙 -躠 -櫩 -椮 -絀 -鷾 -溳 -詖 -葓 -谼 -𦩘 -韔 -翿 -呑 -馡 -騊 -堁 -蓏 -䟃 -頟 -渢 -趑 -堄 -滛 -擫 -豭 -騩 -騘 -䍧 -彍 -忭 -餙 -馺 -忩 -芣 -矴 -噂 -滍 -慫 -𨍏 -怲 -扵 -搊 -昩 -嶻 -禬 -憃 -忼 -榰 -箾 -撁 -鈆 -袗 -脤 -騞 -哤 -螀 -靧 -梲 -囦 -魖 -褠 -䭔 -煆 -挃 -宷 -熉 -朘 -憭 -䒠 -謭 -鷤 -拕 -骫 -穾 -襭 -喓 -冞 -勩 -媢 -麚 -椓 -俙 -幐 -磝 -蜎 -灙 -漦 -㛹 -䭀 -㜷 -粻 -懟 -箳 -滣 -糉 -冐 -韤 -湱 -糭 -栳 -勌 -慱 -㸌 -罫 -筞 -霿 -躶 -玞 -磉 -罦 -祴 -媟 -猒 -擭 -恠 -嵁 -屴 -慆 -庬 -蟁 -㹀 -薧 -鷕 -渻 -朂 -愯 -齚 -蝻 -胏 -饙 -鳦 -鸃 -叅 -肧 -篂 -脗 -雺 -飰 -筀 -頥 -毶 -弌 -逓 -瞍 -絁 -鏚 -嚵 -攂 -醄 -奼 -獫 -絣 -靷 -畮 -褉 -棁 -揑 -楥 -橤 -襥 -蹮 -窔 -忪 -潠 -杇 -黲 -擄 -蚻 -蘙 -虙 -袐 -陿 -帊 -醟 -髖 -㞦 -鱭 -譸 -鮞 -栧 -扺 -脽 -擉 -岨 -黈 -餻 -佪 -遻 -鋟 -瞶 -廽 -懨 -墖 -玁 -籉 -宼 -鰋 -瑽 -垖 -酕 -漰 -戹 -蝛 -瑲 -阤 -褆 -儛 -䍽 -觕 -箘 -碯 -灨 -燀 -膇 -韀 -䳏 -詿 -禂 -韣 -踡 -碏 -尵 -莭 -庻 -篿 -狤 -㘞 -艭 -殱 -鵔 -槮 -猧 -劙 -獝 -㭊 -㾕 -蠚 -帤 -蹢 -蛚 -輼 -麀 -檃 -䰒 -䪫 -悾 -濳 -杗 -揾 -駏 -撦 -耈 -蟟 -狌 -鸖 -䨥 -餫 -鍰 -耉 -毚 -袽 -䱥 -慓 -䔿 -艖 -舋 -弰 -蠺 -嫓 -蚳 -髾 -喨 -鴐 -䍥 -韍 -柹 -掁 -薋 -攕 -飺 -凢 -麌 -嫰 -鑚 -黦 -葠 -吿 -栰 -踶 -芿 -穭 -啝 -筓 -褁 -稇 -顚 -䎘 -恇 -珷 -緪 -墠 -蛣 -蛜 -讕 -疻 -惎 -袝 -霡 -罸 -鬽 -苢 -喭 -飊 -唎 -澾 -襋 -皭 -廏 -蔿 -穊 -䝟 -駊 -獹 -夣 -褾 -慴 -軥 -讁 -軰 -瞷 -𡋯 -晜 -潗 -衋 -揵 -覼 -鱐 -醡 -䏰 -侐 -亁 -桞 -驘 -鬋 -鷽 -懞 -㵳 -儳 -豝 -傺 -搒 -縧 -硾 -䏶 -覻 -薍 -憝 -榠 -湆 -皵 -鎞 -菆 -糇 -矉 -搤 -紃 -峿 -磹 -甒 -琭 -𩥇 -菢 -禡 -渹 -刅 -迒 -敂 -蹜 -磓 -傪 -縿 -㕮 -涏 -䰀 -㡛 -韛 -犠 -餦 -圝 -焫 -㝢 -潬 -馵 -澟 -鱏 -譾 -㪍 -煼 -鍜 -窽 -紾 -堨 -䕸 -穅 -戅 -穄 -駴 -偫 -煗 -媠 -酘 -矬 -貆 -茞 -骩 -扠 -岞 -潓 -炧 -陊 -栭 -釱 -㡚 -篴 -耞 -鞉 -䋏 -𤫩 -椸 -儜 -痀 -謷 -潙 -寠 -牐 -嫭 -慅 -獧 -鈒 -欿 -薳 -蟂 -郲 -軨 -斨 -訦 -𠴲 -剺 -駪 -贙 -禫 -噣 -茢 -茙 -鄼 -揷 -魌 -䫻 -嗋 -噐 -侲 -諵 -𠺕 -挍 -䑳 -㨷 -槸 -靘 -㩧 -虣 -瑿 -衱 -襹 -餭 -㗶 -枑 -悋 -纑 -嶫 -儓 -髵 -甗 -榝 -㗭 -贗 -熸 -嬃 -礌 -偭 -樠 -栮 -鷼 -鵀 -澬 -眂 -牿 -骴 -呞 -爕 -牎 -巹 -帉 -砠 -梴 -䛏 -攃 -餁 -哿 -蹝 -崺 -閌 -醝 -臡 -麖 -駼 -賵 -夘 -骻 -愡 -俔 -諐 -觩 -莂 -饈 -殣 -溠 -冱 -埓 -厫 -虥 -芄 -慽 -竃 -埿 -仭 -褼 -倛 -韸 -牗 -幖 -禈 -穧 -蜧 -諞 -脞 -蝃 -飃 -煁 -涒 -谾 -覢 -赮 -鼘 -艗 -䶉 -鴥 -轒 -睅 -傔 -惵 -唈 -懆 -磣 -膢 -堶 -囈 -瘕 -誷 -瑘 -絝 -鬈 -嘽 -鷅 -梜 -喎 -鼟 -㟧 -劻 -眑 -剴 -痎 -餟 -庌 -菷 -梐 -吺 -躘 -慞 -罼 -穨 -摏 -釄 -莋 -呺 -砅 -鴽 -㘭 -㟅 -艴 -犉 -籕 -跐 -惏 -陗 -刋 -襘 -醹 -紽 -痌 -㗀 -撋 -陼 -駷 -艼 -踼 -癏 -慠 -趒 -邍 -姞 -䂬 -堲 -苙 -椌 -嗃 -挶 -岯 -禗 -嵔 -觡 -豜 -睩 -㒿 -塠 -燂 -扤 -恟 -鬝 -鬇 -鬡 -揳 -霠 -㗫 -苐 -蒀 -圌 -戭 -䖃 -𥈭 -勮 -耝 -轞 -胮 -墯 -枮 -罿 -浺 -綪 -爓 -蘃 -襍 -轜 -閠 -畽 -鄊 -嶆 -籭 -蠯 -陑 -瘽 -迆 -賷 -䍡 -韂 -躃 -禴 -簄 -瓟 -碐 -躨 -侜 -岍 -䃸 -趚 -髐 -榅 -粣 -屝 -鴃 -圁 -蝜 -黫 -僽 -丗 -靣 -湏 -抏 -㟽 -跙 -餤 -朙 -㹞 -瞖 -繣 -㨫 -罙 -糒 -惉 -葽 -鼮 -蕳 -豏 -𥱼 -鵮 -獦 -悕 -𠴨 -闟 -惽 -慿 -隉 -椷 -𩅰 -艛 -眽 -凓 -儃 -奨 -埀 
-瑫 -駚 -濇 -緶 -峉 -礨 -髢 -瞯 -壥 -姡 -㟯 -髬 -啀 -㶿 -歅 -殀 -縩 -疈 -鳸 -霳 -稬 -圊 -彚 -裠 -埳 -褋 -㔩 -矲 -剶 -硋 -聦 -峞 -浰 -窵 -嘂 -睘 -簵 -腒 -韘 -躣 -甈 -忲 -舽 -襂 -硠 -脃 -鐏 -奯 -脧 -矕 -䠞 -駹 -豶 -訑 -柸 -鰅 -瘨 -趿 -糦 -蟏 -饛 -尰 -諑 -汃 -毺 -鋃 -絚 -馧 -艬 -枍 -爊 -峗 -泙 -碖 -鵕 -尩 -閗 -𤧚 -幩 -塉 -箊 -覂 -玒 -橧 -謟 -庨 -籔 -欑 -厎 -尭 -氉 -蠈 -䓞 -矙 -梡 -瀩 -溔 -煴 -蔲 -僬 -嵢 -梩 -弝 -𣙙 -鞟 -敉 -鮚 -湠 -鐐 -爣 -裻 -䶎 -𦨴 -謿 -垾 -蝂 -睂 -癙 -韽 -㟳 -桒 -鳿 -樏 -峛 -瑉 -僄 -顣 -衺 -殗 -肦 -圑 -朒 -喌 -犦 -㰅 -疁 -氃 -吰 -陻 -盰 -娀 -魶 -㖃 -曒 -娿 -獱 -孏 -酅 -蝡 -齰 -莬 -鄀 -逥 -挿 -觵 -縆 -㟝 -繍 -碙 -㑂 -䎳 -兾 -壸 -賝 -桯 -跁 -跒 -蔍 -舼 -忀 -懭 -媌 -罭 -菵 -狔 -靿 -拪 -㲉 -䔲 -嬀 -鵽 -涳 -朾 -𡸣 -𢫫 -虈 -㜮 -顑 -櫋 -蔪 -旝 -湡 -蹛 -稆 -唽 -㟏 -熂 -龡 -煟 -韅 -韐 -慂 -剳 -掫 -兠 -摋 -羫 -璊 -鵻 -駓 -佌 -蜹 -晲 -矒 -玅 -剰 -斶 -紖 -懴 -駜 -羢 -麳 -㳷 -馞 -爥 -鍚 -鑢 -螵 -嗺 -鏨 -𠙶 -疪 -鷔 -鮧 -轊 -栘 -鼜 -睗 -蟘 -枓 -䖟 -剠 -瞤 -圛 -椳 -籸 -䪌 -鯹 -湌 -丳 -賧 -縭 -檾 -𦨻 -撆 -䩫 -磢 -惥 -譀 -罤 -鞸 -鉎 -㶏 -膁 -甋 -瓀 -懹 -槢 -硊 -弆 -琫 -嵠 -駻 -湢 -杮 -䌨 -訹 -藇 -穯 -蠉 -曭 -蹎 -詄 -毷 -𩃎 -熁 -灜 -蜫 -蜳 -昈 -帩 -鈋 -䐹 -顖 -鄹 -匶 -毾 -礜 -堭 -婞 -鷿 -㙞 -詀 -瘮 -䫜 -㾪 -捘 -屫 -誧 -䲔 -閍 -蒳 -㬋 -遟 -嶀 -葐 -蜼 -㻱 -曡 -䃜 -濴 -䦱 -霫 -譆 -霋 -蕰 -襓 -氋 -鴷 -魦 -㩻 -㡠 -灉 -贑 -燑 -峝 -輷 -烻 -耼 -螉 -跜 -豩 -㑃 -藙 -鋂 -胐 -𣔻 -紒 -瓓 -塯 -辴 -趷 -堛 -㒟 -㗲 -㬊 -䄡 -卄 -姧 -猓 -躗 -覤 -醊 -兎 -罯 -痯 -覸 -詉 -癿 -岋 -歝 -茟 -㘆 -㮰 -淜 -𥉌 -㫰 -鈌 -毵 -狉 -贜 -峬 -汻 -誖 -烓 -睋 -潎 -䲺 -㠓 -歖 -𠜱 -槵 -熚 -萷 -磤 -絸 -鷞 -聻 -屷 -㝵 -諕 -瘂 -㺷 -蚰 -柦 -䍐 -泿 -礰 -摎 -㜕 -㻞 -洓 -喍 -囌 -囐 -䙱 -腨 -妉 -鄛 -鄥 -㵝 -輧 -鱄 -騟 -鈚 -廜 -𨗨 -㶼 -膞 -崯 -硞 -萆 -眒 -譩 -揬 -藑 -匌 -㠾 -㥏 -㢮 -䕢 -帣 -酭 -枦 -孅 -鞙 -丷 -鍭 -䤴 -餂 -愗 -冘 -埛 -㒇 -郕 -蔯 -簰 -刔 -蠩 -耏 -鞹 -𧑅 -觹 -䐑 -磶 -蹵 -鵃 -耛 -蓤 -臄 -轙 -庤 -㒩 -翐 -榥 -晀 -輣 -蟚 -拲 -皠 -穱 -䃔 -䃧 -窡 -絍 -礿 -鑞 -栯 -㾓 -掿 -厞 -淂 -撶 -伹 -鹻 -軓 -岹 -蚷 -榸 -刾 -艂 -㤝 -塕 -蚔 -藾 -攓 -鏬 -珫 -黪 -蟧 -猭 -漑 -粺 -驆 -撘 -亾 -㼌 -蝑 -澓 -揞 -欱 -愶 -泲 -醷 -螴 -芚 -絻 -轃 -漮 -唪 -岉 -鬀 -䱹 -齖 -䂓 -趢 -荓 -覶 -鯾 -諿 -槥 -嚆 -爢 -瓬 -笐 -篢 -舝 -襵 -鎒 -𤝞 -肭 -瘇 -笓 -餑 -豋 -湗 -緎 -肐 -胲 -掤 -潫 -䖴 -𠎝 -𨺗 -諢 -毈 -寱 -唲 -䃭 -峮 -狘 -韊 -䬝 -呰 -㹱 -碞 -畞 -㠌 -黭 -蚘 -豵 -穥 -尯 -㳇 -隵 -灇 -壜 -楰 -彲 -甤 -綹 -旞 -𡏟 -曁 -喩 -𥲤 -郈 -塺 -訧 -絿 -掔 -蠮 -𡱰 -䃺 -宻 -灎 -羵 -𨠵 -糚 -摉 -壷 -勴 -瑃 -鎝 -𥜥 -婥 -鬺 -扢 -肣 -溰 -磩 -耇 -宎 -㔇 -霱 -敚 -汳 -鏄 -儹 -隥 -㿉 -膆 -崏 -𦭵 -郔 -扂 -垗 -㳂 -礛 -缻 -垜 -晱 -訩 -蘪 -珇 -怮 -垝 -㔢 -憛 -痝 -蟨 -鞁 -鶤 -肎 -傝 -䢆 -䰄 -𥊚 -㖀 -㠭 -壵 -墋 -㠔 -橜 -怓 -蚹 -塛 -憪 -鋝 -腶 -嶾 -翍 -溓 -齼 -蔂 -䃂 -鉺 -攑 -瓐 -泎 -眤 -邘 -崝 -稡 -愸 -髥 -輹 -詨 -髆 -麃 -虤 -洐 -婐 -挏 -峑 -嶣 -篬 -葄 -瑎 -瓉 -㳅 -葼 -姙 -䪜 -𩇕 -焭 -剚 -濪 -霵 -僒 - -羭 diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv5_dict.txt b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv5_dict.txt deleted file mode 100644 index 567898b49de2707853454682f05e0c504c0085b9..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv5_dict.txt +++ /dev/null @@ -1,18383 +0,0 @@ -  -一 -乙 -二 -十 -丁 -厂 -七 -卜 -八 -人 -入 -儿 -匕 -几 -九 -刁 -了 -刀 -力 -乃 -又 -三 -干 -于 -亏 -工 -土 -士 -才 -下 -寸 -大 -丈 -与 -万 -上 -小 -口 -山 -巾 -千 -乞 -川 -亿 -个 -夕 -久 -么 -勺 -凡 -丸 -及 -广 -亡 -门 -丫 -义 -之 -尸 -己 -已 -巳 -弓 -子 -卫 -也 -女 -刃 -飞 -习 -叉 -马 -乡 -丰 -王 -开 -井 -天 -夫 -元 -无 -云 -专 -丐 -扎 -艺 -木 -五 -支 -厅 -不 -犬 -太 -区 -历 -歹 -友 -尤 -匹 -车 -巨 -牙 -屯 -戈 -比 -互 -切 -瓦 -止 -少 -曰 -日 -中 -贝 -冈 -内 -水 -见 -午 -牛 -手 -气 -毛 -壬 -升 -夭 -长 -仁 -什 -片 -仆 -化 -仇 -币 -仍 -仅 -斤 -爪 -反 -介 -父 -从 -仑 -今 -凶 -分 -乏 -公 -仓 -月 -氏 -勿 -欠 -风 -丹 -匀 -乌 -勾 -凤 -六 -文 -亢 -方 -火 -为 -斗 -忆 -计 -订 -户 -认 -冗 -讥 -心 -尺 -引 -丑 -巴 -孔 -队 -办 -以 -允 -予 -邓 -劝 -双 -书 -幻 -玉 -刊 -未 -末 -示 -击 -打 -巧 -正 -扑 -卉 -扒 -功 -扔 -去 -甘 -世 -艾 -古 -节 -本 -术 -可 -丙 -左 -厉 -石 -右 -布 -夯 -戊 -龙 -平 -灭 -轧 -东 -卡 -北 -占 -凸 -卢 -业 -旧 -帅 -归 -旦 -目 -且 -叶 -甲 -申 -叮 -电 -号 -田 -由 -只 -叭 -史 -央 -兄 -叽 -叼 -叫 -叩 -叨 -另 -叹 -冉 -皿 -凹 -囚 -四 -生 -矢 -失 -乍 -禾 -丘 -付 -仗 -代 -仙 -们 -仪 -白 -仔 -他 -斥 -瓜 -乎 -丛 -令 -用 -甩 -印 -尔 -乐 -句 -匆 -册 -卯 -犯 -外 -处 -冬 -鸟 -务 -包 -饥 -主 -市 -立 -冯 -玄 -闪 -兰 -半 -汁 -汇 -头 -汉 -宁 -穴 -它 -讨 -写 
-让 -礼 -训 -议 -必 -讯 -记 -永 -司 -尼 -民 -弗 -弘 -出 -辽 -奶 -奴 -召 -加 -皮 -边 -孕 -发 -圣 -对 -台 -矛 -纠 -母 -幼 -丝 -邦 -式 -迂 -刑 -戎 -动 -扛 -寺 -吉 -扣 -考 -托 -老 -巩 -圾 -执 -扩 -扫 -地 -场 -扬 -耳 -芋 -共 -芒 -亚 -芝 -朽 -朴 -机 -权 -过 -臣 -吏 -再 -协 -西 -压 -厌 -戌 -在 -百 -有 -存 -而 -页 -匠 -夸 -夺 -灰 -达 -列 -死 -成 -夹 -夷 -轨 -邪 -尧 -划 -迈 -毕 -至 -此 -贞 -师 -尘 -尖 -劣 -光 -当 -早 -吁 -吐 -吓 -虫 -曲 -团 -吕 -同 -吊 -吃 -因 -吸 -吗 -吆 -屿 -屹 -岁 -帆 -回 -岂 -则 -刚 -网 -肉 -年 -朱 -先 -丢 -廷 -舌 -竹 -迁 -乔 -迄 -伟 -传 -乒 -乓 -休 -伍 -伏 -优 -臼 -伐 -延 -仲 -件 -任 -伤 -价 -伦 -份 -华 -仰 -仿 -伙 -伪 -自 -伊 -血 -向 -似 -后 -行 -舟 -全 -会 -杀 -合 -兆 -企 -众 -爷 -伞 -创 -肌 -肋 -朵 -杂 -危 -旬 -旨 -旭 -负 -匈 -名 -各 -多 -争 -色 -壮 -冲 -妆 -冰 -庄 -庆 -亦 -刘 -齐 -交 -衣 -次 -产 -决 -亥 -充 -妄 -闭 -问 -闯 -羊 -并 -关 -米 -灯 -州 -汗 -污 -江 -汛 -池 -汝 -汤 -忙 -兴 -宇 -守 -宅 -字 -安 -讲 -讳 -军 -讶 -许 -讹 -论 -讼 -农 -讽 -设 -访 -诀 -寻 -那 -迅 -尽 -导 -异 -弛 -孙 -阵 -阳 -收 -阶 -阴 -防 -奸 -如 -妇 -妃 -好 -她 -妈 -戏 -羽 -观 -欢 -买 -红 -驮 -纤 -驯 -约 -级 -纪 -驰 -纫 -巡 -寿 -弄 -麦 -玖 -玛 -形 -进 -戒 -吞 -远 -违 -韧 -运 -扶 -抚 -坛 -技 -坏 -抠 -扰 -扼 -拒 -找 -批 -址 -扯 -走 -抄 -贡 -汞 -坝 -攻 -赤 -折 -抓 -扳 -抡 -扮 -抢 -孝 -坎 -均 -抑 -抛 -投 -坟 -坑 -抗 -坊 -抖 -护 -壳 -志 -块 -扭 -声 -把 -报 -拟 -却 -抒 -劫 -芙 -芜 -苇 -芽 -花 -芹 -芥 -芬 -苍 -芳 -严 -芦 -芯 -劳 -克 -芭 -苏 -杆 -杠 -杜 -材 -村 -杖 -杏 -杉 -巫 -极 -李 -杨 -求 -甫 -匣 -更 -束 -吾 -豆 -两 -酉 -丽 -医 -辰 -励 -否 -还 -尬 -歼 -来 -连 -轩 -步 -卤 -坚 -肖 -旱 -盯 -呈 -时 -吴 -助 -县 -里 -呆 -吱 -吠 -呕 -园 -旷 -围 -呀 -吨 -足 -邮 -男 -困 -吵 -串 -员 -呐 -听 -吟 -吩 -呛 -吻 -吹 -呜 -吭 -吧 -邑 -吼 -囤 -别 -吮 -岖 -岗 -帐 -财 -针 -钉 -牡 -告 -我 -乱 -利 -秃 -秀 -私 -每 -兵 -估 -体 -何 -佐 -佑 -但 -伸 -佃 -作 -伯 -伶 -佣 -低 -你 -住 -位 -伴 -身 -皂 -伺 -佛 -囱 -近 -彻 -役 -返 -余 -希 -坐 -谷 -妥 -含 -邻 -岔 -肝 -肛 -肚 -肘 -肠 -龟 -甸 -免 -狂 -犹 -狈 -角 -删 -条 -彤 -卵 -灸 -岛 -刨 -迎 -饭 -饮 -系 -言 -冻 -状 -亩 -况 -床 -库 -庇 -疗 -吝 -应 -这 -冷 -庐 -序 -辛 -弃 -冶 -忘 -闰 -闲 -间 -闷 -判 -兑 -灶 -灿 -灼 -弟 -汪 -沐 -沛 -汰 -沥 -沙 -汽 -沃 -沦 -汹 -泛 -沧 -没 -沟 -沪 -沈 -沉 -沁 -怀 -忧 -忱 -快 -完 -宋 -宏 -牢 -究 -穷 -灾 -良 -证 -启 -评 -补 -初 -社 -祀 -识 -诈 -诉 -罕 -诊 -词 -译 -君 -灵 -即 -层 -屁 -尿 -尾 -迟 -局 -改 -张 -忌 -际 -陆 -阿 -陈 -阻 -附 -坠 -妓 -妙 -妖 -姊 -妨 -妒 -努 -忍 -劲 -矣 -鸡 -纬 -驱 -纯 -纱 -纲 -纳 -驳 -纵 -纷 -纸 -纹 -纺 -驴 -纽 -奉 -玩 -环 -武 -青 -责 -现 -玫 -表 -规 -抹 -卦 -坷 -坯 -拓 -拢 -拔 -坪 -拣 -坦 -担 -坤 -押 -抽 -拐 -拖 -者 -拍 -顶 -拆 -拎 -拥 -抵 -拘 -势 -抱 -拄 -垃 -拉 -拦 -幸 -拌 -拧 -拂 -拙 -招 -坡 -披 -拨 -择 -抬 -拇 -拗 -其 -取 -茉 -苦 -昔 -苛 -若 -茂 -苹 -苗 -英 -苟 -苑 -苞 -范 -直 -茁 -茄 -茎 -苔 -茅 -枉 -林 -枝 -杯 -枢 -柜 -枚 -析 -板 -松 -枪 -枫 -构 -杭 -杰 -述 -枕 -丧 -或 -画 -卧 -事 -刺 -枣 -雨 -卖 -郁 -矾 -矿 -码 -厕 -奈 -奔 -奇 -奋 -态 -欧 -殴 -垄 -妻 -轰 -顷 -转 -斩 -轮 -软 -到 -非 -叔 -歧 -肯 -齿 -些 -卓 -虎 -虏 -肾 -贤 -尚 -旺 -具 -味 -果 -昆 -国 -哎 -咕 -昌 -呵 -畅 -明 -易 -咙 -昂 -迪 -典 -固 -忠 -呻 -咒 -咋 -咐 -呼 -鸣 -咏 -呢 -咄 -咖 -岸 -岩 -帖 -罗 -帜 -帕 -岭 -凯 -败 -账 -贩 -贬 -购 -贮 -图 -钓 -制 -知 -迭 -氛 -垂 -牧 -物 -乖 -刮 -秆 -和 -季 -委 -秉 -佳 -侍 -岳 -供 -使 -例 -侠 -侥 -版 -侄 -侦 -侣 -侧 -凭 -侨 -佩 -货 -侈 -依 -卑 -的 -迫 -质 -欣 -征 -往 -爬 -彼 -径 -所 -舍 -金 -刹 -命 -肴 -斧 -爸 -采 -觅 -受 -乳 -贪 -念 -贫 -忿 -肤 -肺 -肢 -肿 -胀 -朋 -股 -肮 -肪 -肥 -服 -胁 -周 -昏 -鱼 -兔 -狐 -忽 -狗 -狞 -备 -饰 -饱 -饲 -变 -京 -享 -庞 -店 -夜 -庙 -府 -底 -疟 -疙 -疚 -剂 -卒 -郊 -庚 -废 -净 -盲 -放 -刻 -育 -氓 -闸 -闹 -郑 -券 -卷 -单 -炬 -炒 -炊 -炕 -炎 -炉 -沫 -浅 -法 -泄 -沽 -河 -沾 -泪 -沮 -油 -泊 -沿 -泡 -注 -泣 -泞 -泻 -泌 -泳 -泥 -沸 -沼 -波 -泼 -泽 -治 -怔 -怯 -怖 -性 -怕 -怜 -怪 -怡 -学 -宝 -宗 -定 -宠 -宜 -审 -宙 -官 -空 -帘 -宛 -实 -试 -郎 -诗 -肩 -房 -诚 -衬 -衫 -视 -祈 -话 -诞 -诡 -询 -该 -详 -建 -肃 -录 -隶 -帚 -屉 -居 -届 -刷 -屈 -弧 -弥 -弦 -承 -孟 -陋 -陌 -孤 -陕 -降 -函 -限 -妹 -姑 -姐 -姓 -妮 -始 -姆 -迢 -驾 -叁 -参 -艰 -线 -练 -组 -绅 -细 -驶 -织 -驹 -终 -驻 -绊 -驼 -绍 -绎 -经 -贯 -契 -贰 -奏 -春 -帮 -玷 -珍 -玲 -玻 -毒 -型 -拭 -挂 -封 -持 -拷 -拱 -项 -垮 -挎 -城 -挟 -挠 -政 -赴 -赵 -挡 -拽 -哉 -挺 -括 -垢 -拴 -拾 -挑 -垛 -指 -垫 -挣 -挤 -拼 -挖 -按 -挥 -挪 -拯 -某 -甚 -荆 -茸 -革 -茬 -荐 -巷 -带 -草 -茧 -茵 -茶 -荒 -茫 -荡 -荣 -荤 -荧 -故 -胡 -荫 -荔 -南 -药 -标 -栈 -柑 -枯 -柄 -栋 -相 -查 -柏 -栅 -柳 -柱 -柿 -栏 -柠 -树 -勃 -要 -柬 -咸 -威 -歪 -研 -砖 -厘 -厚 -砌 -砂 -泵 -砚 -砍 -面 -耐 -耍 -牵 -鸥 -残 -殃 -轴 -轻 -鸦 -皆 -韭 -背 -战 -点 -虐 -临 -览 -竖 -省 -削 -尝 -昧 -盹 -是 -盼 -眨 -哇 -哄 -哑 -显 -冒 -映 -星 -昨 -咧 -昭 -畏 -趴 -胃 -贵 -界 -虹 -虾 -蚁 -思 -蚂 -虽 
-品 -咽 -骂 -勋 -哗 -咱 -响 -哈 -哆 -咬 -咳 -咪 -哪 -哟 -炭 -峡 -罚 -贱 -贴 -贻 -骨 -幽 -钙 -钝 -钞 -钟 -钢 -钠 -钥 -钦 -钧 -钩 -钮 -卸 -缸 -拜 -看 -矩 -毡 -氢 -怎 -牲 -选 -适 -秒 -香 -种 -秋 -科 -重 -复 -竿 -段 -便 -俩 -贷 -顺 -修 -俏 -保 -促 -俄 -俐 -侮 -俭 -俗 -俘 -信 -皇 -泉 -鬼 -侵 -禹 -侯 -追 -俊 -盾 -待 -徊 -衍 -律 -很 -须 -叙 -剑 -逃 -食 -盆 -胚 -胧 -胆 -胜 -胞 -胖 -脉 -胎 -勉 -狭 -狮 -独 -狰 -狡 -狱 -狠 -贸 -怨 -急 -饵 -饶 -蚀 -饺 -饼 -峦 -弯 -将 -奖 -哀 -亭 -亮 -度 -迹 -庭 -疮 -疯 -疫 -疤 -咨 -姿 -亲 -音 -帝 -施 -闺 -闻 -闽 -阀 -阁 -差 -养 -美 -姜 -叛 -送 -类 -迷 -籽 -娄 -前 -首 -逆 -兹 -总 -炼 -炸 -烁 -炮 -炫 -烂 -剃 -洼 -洁 -洪 -洒 -柒 -浇 -浊 -洞 -测 -洗 -活 -派 -洽 -染 -洛 -浏 -济 -洋 -洲 -浑 -浓 -津 -恃 -恒 -恢 -恍 -恬 -恤 -恰 -恼 -恨 -举 -觉 -宣 -宦 -室 -宫 -宪 -突 -穿 -窃 -客 -诫 -冠 -诬 -语 -扁 -袄 -祖 -神 -祝 -祠 -误 -诱 -诲 -说 -诵 -垦 -退 -既 -屋 -昼 -屏 -屎 -费 -陡 -逊 -眉 -孩 -陨 -除 -险 -院 -娃 -姥 -姨 -姻 -娇 -姚 -娜 -怒 -架 -贺 -盈 -勇 -怠 -癸 -蚤 -柔 -垒 -绑 -绒 -结 -绕 -骄 -绘 -给 -绚 -骆 -络 -绝 -绞 -骇 -统 -耕 -耘 -耗 -耙 -艳 -泰 -秦 -珠 -班 -素 -匿 -蚕 -顽 -盏 -匪 -捞 -栽 -捕 -埂 -捂 -振 -载 -赶 -起 -盐 -捎 -捍 -捏 -埋 -捉 -捆 -捐 -损 -袁 -捌 -都 -哲 -逝 -捡 -挫 -换 -挽 -挚 -热 -恐 -捣 -壶 -捅 -埃 -挨 -耻 -耿 -耽 -聂 -恭 -莽 -莱 -莲 -莫 -莉 -荷 -获 -晋 -恶 -莹 -莺 -真 -框 -梆 -桂 -桔 -栖 -档 -桐 -株 -桥 -桦 -栓 -桃 -格 -桩 -校 -核 -样 -根 -索 -哥 -速 -逗 -栗 -贾 -酌 -配 -翅 -辱 -唇 -夏 -砸 -砰 -砾 -础 -破 -原 -套 -逐 -烈 -殊 -殉 -顾 -轿 -较 -顿 -毙 -致 -柴 -桌 -虑 -监 -紧 -党 -逞 -晒 -眠 -晓 -哮 -唠 -鸭 -晃 -哺 -晌 -剔 -晕 -蚌 -畔 -蚣 -蚊 -蚪 -蚓 -哨 -哩 -圃 -哭 -哦 -恩 -鸯 -唤 -唁 -哼 -唧 -啊 -唉 -唆 -罢 -峭 -峨 -峰 -圆 -峻 -贼 -贿 -赂 -赃 -钱 -钳 -钻 -钾 -铁 -铃 -铅 -缺 -氧 -氨 -特 -牺 -造 -乘 -敌 -秤 -租 -积 -秧 -秩 -称 -秘 -透 -笔 -笑 -笋 -债 -借 -值 -倚 -俺 -倾 -倒 -倘 -俱 -倡 -候 -赁 -俯 -倍 -倦 -健 -臭 -射 -躬 -息 -倔 -徒 -徐 -殷 -舰 -舱 -般 -航 -途 -拿 -耸 -爹 -舀 -爱 -豺 -豹 -颁 -颂 -翁 -胰 -脆 -脂 -胸 -胳 -脏 -脐 -胶 -脑 -脓 -逛 -狸 -狼 -卿 -逢 -鸵 -留 -鸳 -皱 -饿 -馁 -凌 -凄 -恋 -桨 -浆 -衰 -衷 -高 -郭 -席 -准 -座 -症 -病 -疾 -斋 -疹 -疼 -疲 -脊 -效 -离 -紊 -唐 -瓷 -资 -凉 -站 -剖 -竞 -部 -旁 -旅 -畜 -阅 -羞 -羔 -瓶 -拳 -粉 -料 -益 -兼 -烤 -烘 -烦 -烧 -烛 -烟 -烙 -递 -涛 -浙 -涝 -浦 -酒 -涉 -消 -涡 -浩 -海 -涂 -浴 -浮 -涣 -涤 -流 -润 -涧 -涕 -浪 -浸 -涨 -烫 -涩 -涌 -悖 -悟 -悄 -悍 -悔 -悯 -悦 -害 -宽 -家 -宵 -宴 -宾 -窍 -窄 -容 -宰 -案 -请 -朗 -诸 -诺 -读 -扇 -诽 -袜 -袖 -袍 -被 -祥 -课 -冥 -谁 -调 -冤 -谅 -谆 -谈 -谊 -剥 -恳 -展 -剧 -屑 -弱 -陵 -祟 -陶 -陷 -陪 -娱 -娟 -恕 -娥 -娘 -通 -能 -难 -预 -桑 -绢 -绣 -验 -继 -骏 -球 -琐 -理 -琉 -琅 -捧 -堵 -措 -描 -域 -捺 -掩 -捷 -排 -焉 -掉 -捶 -赦 -堆 -推 -埠 -掀 -授 -捻 -教 -掏 -掐 -掠 -掂 -培 -接 -掷 -控 -探 -据 -掘 -掺 -职 -基 -聆 -勘 -聊 -娶 -著 -菱 -勒 -黄 -菲 -萌 -萝 -菌 -萎 -菜 -萄 -菊 -菩 -萍 -菠 -萤 -营 -乾 -萧 -萨 -菇 -械 -彬 -梦 -婪 -梗 -梧 -梢 -梅 -检 -梳 -梯 -桶 -梭 -救 -曹 -副 -票 -酝 -酗 -厢 -戚 -硅 -硕 -奢 -盔 -爽 -聋 -袭 -盛 -匾 -雪 -辅 -辆 -颅 -虚 -彪 -雀 -堂 -常 -眶 -匙 -晨 -睁 -眯 -眼 -悬 -野 -啪 -啦 -曼 -晦 -晚 -啄 -啡 -距 -趾 -啃 -跃 -略 -蚯 -蛀 -蛇 -唬 -累 -鄂 -唱 -患 -啰 -唾 -唯 -啤 -啥 -啸 -崖 -崎 -崭 -逻 -崔 -帷 -崩 -崇 -崛 -婴 -圈 -铐 -铛 -铝 -铜 -铭 -铲 -银 -矫 -甜 -秸 -梨 -犁 -秽 -移 -笨 -笼 -笛 -笙 -符 -第 -敏 -做 -袋 -悠 -偿 -偶 -偎 -偷 -您 -售 -停 -偏 -躯 -兜 -假 -衅 -徘 -徙 -得 -衔 -盘 -舶 -船 -舵 -斜 -盒 -鸽 -敛 -悉 -欲 -彩 -领 -脚 -脖 -脯 -豚 -脸 -脱 -象 -够 -逸 -猜 -猪 -猎 -猫 -凰 -猖 -猛 -祭 -馅 -馆 -凑 -减 -毫 -烹 -庶 -麻 -庵 -痊 -痒 -痕 -廊 -康 -庸 -鹿 -盗 -章 -竟 -商 -族 -旋 -望 -率 -阎 -阐 -着 -羚 -盖 -眷 -粘 -粗 -粒 -断 -剪 -兽 -焊 -焕 -清 -添 -鸿 -淋 -涯 -淹 -渠 -渐 -淑 -淌 -混 -淮 -淆 -渊 -淫 -渔 -淘 -淳 -液 -淤 -淡 -淀 -深 -涮 -涵 -婆 -梁 -渗 -情 -惜 -惭 -悼 -惧 -惕 -惟 -惊 -惦 -悴 -惋 -惨 -惯 -寇 -寅 -寄 -寂 -宿 -窒 -窑 -密 -谋 -谍 -谎 -谐 -袱 -祷 -祸 -谓 -谚 -谜 -逮 -敢 -尉 -屠 -弹 -隋 -堕 -随 -蛋 -隅 -隆 -隐 -婚 -婶 -婉 -颇 -颈 -绩 -绪 -续 -骑 -绰 -绳 -维 -绵 -绷 -绸 -综 -绽 -绿 -缀 -巢 -琴 -琳 -琢 -琼 -斑 -替 -揍 -款 -堪 -塔 -搭 -堰 -揩 -越 -趁 -趋 -超 -揽 -堤 -提 -博 -揭 -喜 -彭 -揣 -插 -揪 -搜 -煮 -援 -搀 -裁 -搁 -搓 -搂 -搅 -壹 -握 -搔 -揉 -斯 -期 -欺 -联 -葫 -散 -惹 -葬 -募 -葛 -董 -葡 -敬 -葱 -蒋 -蒂 -落 -韩 -朝 -辜 -葵 -棒 -棱 -棋 -椰 -植 -森 -焚 -椅 -椒 -棵 -棍 -椎 -棉 -棚 -棕 -棺 -榔 -椭 -惠 -惑 -逼 -粟 -棘 -酣 -酥 -厨 -厦 -硬 -硝 -确 -硫 -雁 -殖 -裂 -雄 -颊 -雳 -暂 -雅 -翘 -辈 -悲 -紫 -凿 -辉 -敞 -棠 -赏 -掌 -晴 -睐 -暑 -最 -晰 -量 -鼎 -喷 -喳 -晶 -喇 -遇 -喊 -遏 -晾 -景 -畴 -践 -跋 -跌 -跑 -跛 -遗 -蛙 -蛛 -蜓 -蜒 -蛤 -喝 -鹃 -喂 -喘 -喉 -喻 -啼 -喧 -嵌 -幅 -帽 -赋 -赌 -赎 -赐 -赔 -黑 -铸 -铺 -链 -销 -锁 -锄 -锅 -锈 -锋 -锌 -锐 -甥 -掰 -短 -智 -氮 -毯 -氯 -鹅 -剩 -稍 -程 -稀 -税 -筐 -等 -筑 -策 -筛 -筒 -筏 -答 
-筋 -筝 -傲 -傅 -牌 -堡 -集 -焦 -傍 -储 -皓 -皖 -粤 -奥 -街 -惩 -御 -循 -艇 -舒 -逾 -番 -释 -禽 -腊 -脾 -腋 -腔 -腕 -鲁 -猩 -猬 -猾 -猴 -惫 -然 -馈 -馋 -装 -蛮 -就 -敦 -斌 -痘 -痢 -痪 -痛 -童 -竣 -阔 -善 -翔 -羡 -普 -粪 -尊 -奠 -道 -遂 -曾 -焰 -港 -滞 -湖 -湘 -渣 -渤 -渺 -湿 -温 -渴 -溃 -溅 -滑 -湃 -渝 -湾 -渡 -游 -滋 -渲 -溉 -愤 -慌 -惰 -愕 -愣 -惶 -愧 -愉 -慨 -割 -寒 -富 -寓 -窜 -窝 -窖 -窗 -窘 -遍 -雇 -裕 -裤 -裙 -禅 -禄 -谢 -谣 -谤 -谦 -犀 -属 -屡 -强 -粥 -疏 -隔 -隙 -隘 -媒 -絮 -嫂 -媚 -婿 -登 -缅 -缆 -缉 -缎 -缓 -缔 -缕 -骗 -编 -骚 -缘 -瑟 -鹉 -瑞 -瑰 -瑙 -魂 -肆 -摄 -摸 -填 -搏 -塌 -鼓 -摆 -携 -搬 -摇 -搞 -塘 -摊 -聘 -斟 -蒜 -勤 -靴 -靶 -鹊 -蓝 -墓 -幕 -蓬 -蓄 -蒲 -蓉 -蒙 -蒸 -献 -椿 -禁 -楚 -楷 -榄 -想 -槐 -榆 -楼 -概 -赖 -酪 -酬 -感 -碍 -碘 -碑 -碎 -碰 -碗 -碌 -尴 -雷 -零 -雾 -雹 -辐 -辑 -输 -督 -频 -龄 -鉴 -睛 -睹 -睦 -瞄 -睫 -睡 -睬 -嗜 -鄙 -嗦 -愚 -暖 -盟 -歇 -暗 -暇 -照 -畸 -跨 -跷 -跳 -跺 -跪 -路 -跤 -跟 -遣 -蜈 -蜗 -蛾 -蜂 -蜕 -嗅 -嗡 -嗓 -署 -置 -罪 -罩 -蜀 -幌 -错 -锚 -锡 -锣 -锤 -锥 -锦 -键 -锯 -锰 -矮 -辞 -稚 -稠 -颓 -愁 -筹 -签 -简 -筷 -毁 -舅 -鼠 -催 -傻 -像 -躲 -魁 -衙 -微 -愈 -遥 -腻 -腰 -腥 -腮 -腹 -腺 -鹏 -腾 -腿 -鲍 -猿 -颖 -触 -解 -煞 -雏 -馍 -馏 -酱 -禀 -痹 -廓 -痴 -痰 -廉 -靖 -新 -韵 -意 -誊 -粮 -数 -煎 -塑 -慈 -煤 -煌 -满 -漠 -滇 -源 -滤 -滥 -滔 -溪 -溜 -漓 -滚 -溢 -溯 -滨 -溶 -溺 -粱 -滩 -慎 -誉 -塞 -寞 -窥 -窟 -寝 -谨 -褂 -裸 -福 -谬 -群 -殿 -辟 -障 -媳 -嫉 -嫌 -嫁 -叠 -缚 -缝 -缠 -缤 -剿 -静 -碧 -璃 -赘 -熬 -墙 -墟 -嘉 -摧 -赫 -截 -誓 -境 -摘 -摔 -撇 -聚 -慕 -暮 -摹 -蔓 -蔑 -蔡 -蔗 -蔽 -蔼 -熙 -蔚 -兢 -模 -槛 -榴 -榜 -榨 -榕 -歌 -遭 -酵 -酷 -酿 -酸 -碟 -碱 -碳 -磁 -愿 -需 -辖 -辗 -雌 -裳 -颗 -瞅 -墅 -嗽 -踊 -蜻 -蜡 -蝇 -蜘 -蝉 -嘛 -嘀 -赚 -锹 -锻 -镀 -舞 -舔 -稳 -熏 -箕 -算 -箩 -管 -箫 -舆 -僚 -僧 -鼻 -魄 -魅 -貌 -膜 -膊 -膀 -鲜 -疑 -孵 -馒 -裹 -敲 -豪 -膏 -遮 -腐 -瘩 -瘟 -瘦 -辣 -彰 -竭 -端 -旗 -精 -粹 -歉 -弊 -熄 -熔 -煽 -潇 -漆 -漱 -漂 -漫 -滴 -漾 -演 -漏 -慢 -慷 -寨 -赛 -寡 -察 -蜜 -寥 -谭 -肇 -褐 -褪 -谱 -隧 -嫩 -翠 -熊 -凳 -骡 -缩 -慧 -撵 -撕 -撒 -撩 -趣 -趟 -撑 -撮 -撬 -播 -擒 -墩 -撞 -撤 -增 -撰 -聪 -鞋 -鞍 -蕉 -蕊 -蔬 -蕴 -横 -槽 -樱 -橡 -樟 -橄 -敷 -豌 -飘 -醋 -醇 -醉 -磕 -磊 -磅 -碾 -震 -霄 -霉 -瞒 -题 -暴 -瞎 -嘻 -嘶 -嘲 -嘹 -影 -踢 -踏 -踩 -踪 -蝶 -蝴 -蝠 -蝎 -蝌 -蝗 -蝙 -嘿 -嘱 -幢 -墨 -镇 -镐 -镑 -靠 -稽 -稻 -黎 -稿 -稼 -箱 -篓 -箭 -篇 -僵 -躺 -僻 -德 -艘 -膝 -膛 -鲤 -鲫 -熟 -摩 -褒 -瘪 -瘤 -瘫 -凛 -颜 -毅 -糊 -遵 -憋 -潜 -澎 -潮 -潭 -鲨 -澳 -潘 -澈 -澜 -澄 -懂 -憔 -懊 -憎 -额 -翩 -褥 -谴 -鹤 -憨 -慰 -劈 -履 -豫 -缭 -撼 -擂 -操 -擅 -燕 -蕾 -薯 -薛 -薇 -擎 -薪 -薄 -颠 -翰 -噩 -橱 -橙 -橘 -整 -融 -瓢 -醒 -霍 -霎 -辙 -冀 -餐 -嘴 -踱 -蹄 -蹂 -蟆 -螃 -器 -噪 -鹦 -赠 -默 -黔 -镜 -赞 -穆 -篮 -篡 -篷 -篱 -儒 -邀 -衡 -膨 -雕 -鲸 -磨 -瘾 -瘸 -凝 -辨 -辩 -糙 -糖 -糕 -燃 -濒 -澡 -激 -懒 -憾 -懈 -窿 -壁 -避 -缰 -缴 -戴 -擦 -藉 -鞠 -藏 -藐 -檬 -檐 -檀 -礁 -磷 -霜 -霞 -瞭 -瞧 -瞬 -瞳 -瞩 -瞪 -曙 -蹋 -蹈 -螺 -蟋 -蟀 -嚎 -赡 -穗 -魏 -簧 -簇 -繁 -徽 -爵 -朦 -臊 -鳄 -癌 -辫 -赢 -糟 -糠 -燥 -懦 -豁 -臀 -臂 -翼 -骤 -藕 -鞭 -藤 -覆 -瞻 -蹦 -嚣 -镰 -翻 -鳍 -鹰 -瀑 -襟 -璧 -戳 -孽 -警 -蘑 -藻 -攀 -曝 -蹲 -蹭 -蹬 -巅 -簸 -簿 -蟹 -颤 -靡 -癣 -瓣 -羹 -鳖 -爆 -疆 -鬓 -壤 -馨 -耀 -躁 -蠕 -嚼 -嚷 -巍 -籍 -鳞 -魔 -糯 -灌 -譬 -蠢 -霸 -露 -霹 -躏 -黯 -髓 -赣 -囊 -镶 -瓤 -罐 -矗 -乂 -乜 -兀 -弋 -孑 -孓 -幺 -亓 -韦 -廿 -丏 -卅 -仄 -厄 -仃 -仉 -仂 -兮 -刈 -爻 -卞 -闩 -讣 -尹 -夬 -爿 -毋 -邗 -邛 -艽 -艿 -札 -叵 -匝 -丕 -匜 -劢 -卟 -叱 -叻 -仨 -仕 -仟 -仡 -仫 -仞 -卮 -氐 -犰 -刍 -邝 -邙 -汀 -讦 -讧 -讪 -讫 -尻 -阡 -尕 -弁 -驭 -匡 -耒 -玎 -玑 -邢 -圩 -圬 -圭 -扦 -圪 -圳 -圹 -扪 -圮 -圯 -芊 -芍 -芄 -芨 -芑 -芎 -芗 -亘 -厍 -夼 -戍 -尥 -乩 -旯 -曳 -岌 -屺 -凼 -囡 -钇 -缶 -氘 -氖 -牝 -伎 -伛 -伢 -佤 -仵 -伥 -伧 -伉 -伫 -囟 -汆 -刖 -夙 -旮 -刎 -犷 -犸 -舛 -凫 -邬 -饧 -汕 -汔 -汐 -汲 -汜 -汊 -忖 -忏 -讴 -讵 -祁 -讷 -聿 -艮 -厾 -阱 -阮 -阪 -丞 -妁 -牟 -纡 -纣 -纥 -纨 -玕 -玙 -抟 -抔 -圻 -坂 -坍 -坞 -抃 -抉 -㧐 -芫 -邯 -芸 -芾 -苈 -苣 -芷 -芮 -苋 -芼 -苌 -苁 -芩 -芪 -芡 -芟 -苄 -苎 -苡 -杌 -杓 -杞 -杈 -忑 -孛 -邴 -邳 -矶 -奁 -豕 -忒 -欤 -轫 -迓 -邶 -忐 -卣 -邺 -旰 -呋 -呒 -呓 -呔 -呖 -呃 -旸 -吡 -町 -虬 -呗 -吽 -吣 -吲 -帏 -岐 -岈 -岘 -岑 -岚 -兕 -囵 -囫 -钊 -钋 -钌 -迕 -氙 -氚 -牤 -佞 -邱 -攸 -佚 -佝 -佟 -佗 -伽 -彷 -佘 -佥 -孚 -豸 -坌 -肟 -邸 -奂 -劬 -狄 -狁 -鸠 -邹 -饨 -饩 -饪 -饫 -饬 -亨 -庑 -庋 -疔 -疖 -肓 -闱 -闳 -闵 -羌 -炀 -沣 -沅 -沔 -沤 -沌 -沏 -沚 -汩 -汨 -沂 -汾 -沨 -汴 -汶 -沆 -沩 -泐 -怃 -怄 -忡 -忤 -忾 -怅 -忻 -忪 -怆 -忭 -忸 -诂 -诃 -诅 -诋 -诌 -诏 -诒 -孜 -陇 -陀 -陂 -陉 -妍 -妩 -妪 -妣 -妊 -妗 -妫 -妞 -姒 -妤 -邵 -劭 -刭 -甬 -邰 -纭 -纰 -纴 -纶 -纾 -玮 -玡 -玭 -玠 -玢 -玥 -玦 -盂 -忝 -匦 -坩 -抨 -拤 -坫 -拈 -垆 -抻 -劼 -拃 -拊 -坼 -坻 -㧟 -坨 -坭 -抿 -坳 -耶 -苷 -苯 -苤 -茏 -苫 -苜 -苴 -苒 -苘 -茌 -苻 -苓 -茚 -茆 -茑 -茓 -茔 -茕 -茀 
-苕 -枥 -枇 -杪 -杳 -枧 -杵 -枨 -枞 -枋 -杻 -杷 -杼 -矸 -砀 -刳 -奄 -瓯 -殁 -郏 -轭 -郅 -鸢 -盱 -昊 -昙 -杲 -昃 -咂 -呸 -昕 -昀 -旻 -昉 -炅 -咔 -畀 -虮 -咀 -呷 -黾 -呱 -呤 -咚 -咆 -咛 -呶 -呣 -呦 -咝 -岢 -岿 -岬 -岫 -帙 -岣 -峁 -刿 -迥 -岷 -剀 -帔 -峄 -沓 -囹 -罔 -钍 -钎 -钏 -钒 -钕 -钗 -邾 -迮 -牦 -竺 -迤 -佶 -佬 -佰 -侑 -侉 -臾 -岱 -侗 -侃 -侏 -侩 -佻 -佾 -侪 -佼 -佯 -侬 -帛 -阜 -侔 -徂 -刽 -郄 -怂 -籴 -瓮 -戗 -肼 -䏝 -肽 -肱 -肫 -剁 -迩 -郇 -狙 -狎 -狍 -狒 -咎 -炙 -枭 -饯 -饴 -冽 -冼 -庖 -疠 -疝 -疡 -兖 -妾 -劾 -炜 -𬉼 -炖 -炘 -炝 -炔 -泔 -沭 -泷 -泸 -泱 -泅 -泗 -泠 -泺 -泖 -泫 -泮 -沱 -泯 -泓 -泾 -怙 -怵 -怦 -怛 -怏 -怍 -㤘 -怩 -怫 -怿 -宕 -穹 -宓 -诓 -诔 -诖 -诘 -戾 -诙 -戽 -郓 -衩 -祆 -祎 -祉 -祇 -诛 -诜 -诟 -诠 -诣 -诤 -诧 -诨 -诩 -戕 -孢 -亟 -陔 -妲 -妯 -姗 -帑 -弩 -孥 -驽 -虱 -迦 -迨 -绀 -绁 -绂 -驷 -驸 -绉 -绌 -驿 -骀 -甾 -珏 -珐 -珂 -珑 -玳 -珀 -顸 -珉 -珈 -拮 -垭 -挝 -垣 -挞 -垤 -赳 -贲 -垱 -垌 -郝 -垧 -垓 -挦 -垠 -茜 -荚 -荑 -贳 -荜 -莒 -茼 -茴 -茱 -莛 -荞 -茯 -荏 -荇 -荃 -荟 -荀 -茗 -荠 -茭 -茨 -垩 -荥 -荦 -荨 -荩 -剋 -荪 -茹 -荬 -荮 -柰 -栉 -柯 -柘 -栊 -柩 -枰 -栌 -柙 -枵 -柚 -枳 -柞 -柝 -栀 -柢 -栎 -枸 -柈 -柁 -枷 -柽 -剌 -酊 -郦 -甭 -砗 -砘 -砒 -斫 -砭 -砜 -奎 -耷 -虺 -殂 -殇 -殄 -殆 -轱 -轲 -轳 -轶 -轸 -虿 -毖 -觇 -尜 -哐 -眄 -眍 -𠳐 -郢 -眇 -眊 -眈 -禺 -哂 -咴 -曷 -昴 -昱 -昵 -咦 -哓 -哔 -畎 -毗 -呲 -胄 -畋 -畈 -虼 -虻 -盅 -咣 -哕 -剐 -郧 -咻 -囿 -咿 -哌 -哙 -哚 -咯 -咩 -咤 -哝 -哏 -哞 -峙 -峣 -罘 -帧 -峒 -峤 -峋 -峥 -贶 -钚 -钛 -钡 -钣 -钤 -钨 -钫 -钯 -氡 -氟 -牯 -郜 -秕 -秭 -竽 -笈 -笃 -俦 -俨 -俅 -俪 -叟 -垡 -牮 -俣 -俚 -皈 -俑 -俟 -逅 -徇 -徉 -舢 -俞 -郗 -俎 -郤 -爰 -郛 -瓴 -胨 -胪 -胛 -胂 -胙 -胍 -胗 -胝 -朐 -胫 -鸨 -匍 -狨 -狯 -飑 -狩 -狲 -訇 -逄 -昝 -饷 -饸 -饹 -胤 -孪 -娈 -弈 -奕 -庥 -疬 -疣 -疥 -疭 -庠 -竑 -彦 -飒 -闼 -闾 -闿 -阂 -羑 -迸 -籼 -酋 -炳 -炻 -炽 -炯 -烀 -炷 -烃 -洱 -洹 -洧 -洌 -浃 -洇 -洄 -洙 -涎 -洎 -洫 -浍 -洮 -洵 -浒 -浔 -浕 -洳 -恸 -恓 -恹 -恫 -恺 -恻 -恂 -恪 -恽 -宥 -扃 -衲 -衽 -衿 -袂 -祛 -祜 -祓 -祚 -诮 -祗 -祢 -诰 -诳 -鸩 -昶 -郡 -咫 -弭 -牁 -胥 -陛 -陟 -娅 -姮 -娆 -姝 -姣 -姘 -姹 -怼 -羿 -炱 -矜 -绔 -骁 -骅 -绗 -绛 -骈 -耖 -挈 -珥 -珙 -顼 -珰 -珩 -珧 -珣 -珞 -琤 -珲 -敖 -恚 -埔 -埕 -埘 -埙 -埚 -挹 -耆 -耄 -埒 -捋 -贽 -垸 -捃 -盍 -荸 -莆 -莳 -莴 -莪 -莠 -莓 -莜 -莅 -荼 -莩 -荽 -莸 -荻 -莘 -莎 -莞 -莨 -渇 -鸪 -莼 -栲 -栳 -郴 -桓 -桡 -桎 -桢 -桤 -梃 -栝 -桕 -桁 -桧 -桅 -栟 -桉 -栩 -逑 -逋 -彧 -鬲 -豇 -酐 -逦 -厝 -孬 -砝 -砹 -砺 -砧 -砷 -砟 -砼 -砥 -砣 -剞 -砻 -轼 -轾 -辂 -鸫 -趸 -龀 -鸬 -虔 -逍 -眬 -唛 -晟 -眩 -眙 -哧 -哽 -唔 -晁 -晏 -鸮 -趵 -趿 -畛 -蚨 -蚜 -蚍 -蚋 -蚬 -蚝 -蚧 -唢 -圄 -唣 -唏 -盎 -唑 -崂 -崃 -罡 -罟 -峪 -觊 -赅 -钰 -钲 -钴 -钵 -钹 -钺 -钽 -钼 -钿 -铀 -铂 -铄 -铆 -铈 -铉 -铊 -铋 -铌 -铍 -䥽 -铎 -氩 -氤 -氦 -毪 -舐 -秣 -秫 -盉 -笄 -笕 -笊 -笏 -笆 -俸 -倩 -俵 -偌 -俳 -俶 -倬 -倏 -恁 -倭 -倪 -俾 -倜 -隼 -隽 -倌 -倥 -臬 -皋 -郫 -倨 -衄 -颀 -徕 -舫 -釜 -奚 -衾 -胯 -胱 -胴 -胭 -脍 -胼 -朕 -脒 -胺 -鸱 -玺 -鸲 -狷 -猁 -狳 -猃 -狺 -逖 -桀 -袅 -饽 -凇 -栾 -挛 -亳 -疳 -疴 -疸 -疽 -痈 -疱 -痂 -痉 -衮 -凋 -颃 -恣 -旆 -旄 -旃 -阃 -阄 -訚 -阆 -恙 -粑 -朔 -郸 -烜 -烨 -烩 -烊 -剡 -郯 -烬 -涑 -浯 -涞 -涟 -娑 -涅 -涠 -浞 -涓 -浥 -涔 -浜 -浠 -浣 -浚 -悚 -悭 -悝 -悒 -悌 -悛 -宸 -窈 -剜 -诹 -冢 -诼 -袒 -袢 -祯 -诿 -谀 -谂 -谄 -谇 -屐 -屙 -陬 -勐 -奘 -牂 -蚩 -陲 -姬 -娠 -娌 -娉 -娲 -娩 -娴 -娣 -娓 -婀 -畚 -逡 -绠 -骊 -绡 -骋 -绥 -绦 -绨 -骎 -邕 -鸶 -彗 -耜 -焘 -舂 -琏 -琇 -麸 -揶 -埴 -埯 -捯 -掳 -掴 -埸 -埵 -赧 -埤 -捭 -逵 -埝 -堋 -堍 -掬 -鸷 -掖 -捽 -掊 -堉 -掸 -捩 -掮 -悫 -埭 -埽 -掇 -掼 -聃 -菁 -萁 -菘 -堇 -萘 -萋 -菽 -菖 -萜 -萸 -萑 -棻 -菔 -菟 -萏 -萃 -菏 -菹 -菪 -菅 -菀 -萦 -菰 -菡 -梵 -梿 -梏 -觋 -桴 -桷 -梓 -棁 -桫 -棂 -啬 -郾 -匮 -敕 -豉 -鄄 -酞 -酚 -戛 -硎 -硭 -硒 -硖 -硗 -硐 -硇 -硌 -鸸 -瓠 -匏 -厩 -龚 -殒 -殓 -殍 -赉 -雩 -辄 -堑 -眭 -眦 -啧 -晡 -晤 -眺 -眵 -眸 -圊 -喏 -喵 -啉 -勖 -晞 -唵 -晗 -冕 -啭 -畦 -趺 -啮 -跄 -蚶 -蛄 -蛎 -蛆 -蚰 -蛊 -圉 -蚱 -蛉 -蛏 -蚴 -啁 -啕 -唿 -啐 -唼 -唷 -啖 -啵 -啶 -啷 -唳 -唰 -啜 -帻 -崚 -崦 -帼 -崮 -崤 -崆 -赇 -赈 -赊 -铑 -铒 -铗 -铙 -铟 -铠 -铡 -铢 -铣 -铤 -铧 -铨 -铩 -铪 -铫 -铬 -铮 -铯 -铰 -铱 -铳 -铵 -铷 -氪 -牾 -鸹 -秾 -逶 -笺 -筇 -笸 -笪 -笮 -笠 -笥 -笤 -笳 -笾 -笞 -偾 -偃 -偕 -偈 -傀 -偬 -偻 -皑 -皎 -鸻 -徜 -舸 -舻 -舴 -舷 -龛 -翎 -脬 -脘 -脲 -匐 -猗 -猡 -猞 -猝 -斛 -猕 -馗 -馃 -馄 -鸾 -孰 -庹 -庾 -痔 -痍 -疵 -翊 -旌 -旎 -袤 -阇 -阈 -阉 -阊 -阋 -阍 -阏 -羟 -粝 -粕 -敝 -焐 -烯 -焓 -烽 -焖 -烷 -焗 -渍 -渚 -淇 -淅 -淞 -渎 -涿 -淖 -挲 -淠 -涸 -渑 -淦 -淝 -淬 -涪 -淙 -涫 -渌 -淄 -惬 -悻 -悱 -惝 -惘 -悸 -惆 -惚 -惇 -惮 -窕 -谌 -谏 -扈 -皲 -谑 -裆 -袷 -裉 -谒 -谔 -谕 -谖 -谗 -谙 -谛 -谝 -逯 -郿 -隈 -粜 -隍 -隗 -婧 -婊 -婕 -娼 -婢 -婵 -胬 -袈 -翌 -恿 -欸 -绫 -骐 -绮 -绯 -绱 -骒 -绲 -骓 -绶 -绺 -绻 -绾 -骖 -缁 -耠 -琫 -琵 -琶 -琪 -瑛 -琦 -琥 -琨 -靓 -琰 -琮 -琯 -琬 -琛 -琚 
-辇 -鼋 -揳 -堞 -搽 -揸 -揠 -堙 -趄 -揖 -颉 -塄 -揿 -耋 -揄 -蛩 -蛰 -塆 -摒 -揆 -掾 -聒 -葑 -葚 -靰 -靸 -葳 -葺 -葸 -萼 -葆 -葩 -葶 -蒌 -萱 -戟 -葭 -楮 -棼 -椟 -棹 -椤 -棰 -赍 -椋 -椁 -椪 -棣 -椐 -鹁 -覃 -酤 -酢 -酡 -鹂 -厥 -殚 -殛 -雯 -雱 -辊 -辋 -椠 -辍 -辎 -斐 -睄 -睑 -睇 -睃 -戢 -喋 -嗒 -喃 -喱 -喹 -晷 -喈 -跖 -跗 -跞 -跚 -跎 -跏 -跆 -蛱 -蛲 -蛭 -蛳 -蛐 -蛔 -蛞 -蛴 -蛟 -蛘 -喁 -喟 -啾 -嗖 -喑 -嗟 -喽 -嗞 -喀 -喔 -喙 -嵘 -嵖 -崴 -遄 -詈 -嵎 -崽 -嵬 -嵛 -嵯 -嵝 -嵫 -幄 -嵋 -赕 -铻 -铼 -铿 -锃 -锂 -锆 -锇 -锉 -锏 -锑 -锒 -锔 -锕 -掣 -矬 -氰 -毳 -毽 -犊 -犄 -犋 -鹄 -犍 -嵇 -黍 -稃 -稂 -筚 -筵 -筌 -傣 -傈 -舄 -牍 -傥 -傧 -遑 -傩 -遁 -徨 -媭 -畲 -弑 -颌 -翕 -釉 -鹆 -舜 -貂 -腈 -腌 -腓 -腆 -腴 -腑 -腚 -腱 -鱿 -鲀 -鲂 -颍 -猢 -猹 -猥 -飓 -觞 -觚 -猱 -颎 -飧 -馇 -馊 -亵 -脔 -裒 -痣 -痨 -痦 -痞 -痤 -痫 -痧 -赓 -竦 -瓿 -啻 -颏 -鹇 -阑 -阒 -阕 -粞 -遒 -孳 -焯 -焜 -焙 -焱 -鹈 -湛 -渫 -湮 -湎 -湜 -渭 -湍 -湫 -溲 -湟 -溆 -湲 -湔 -湉 -渥 -湄 -滁 -愠 -惺 -愦 -惴 -愀 -愎 -愔 -喾 -寐 -谟 -扉 -裢 -裎 -裥 -祾 -祺 -谠 -幂 -谡 -谥 -谧 -遐 -孱 -弼 -巽 -骘 -媪 -媛 -婷 -巯 -翚 -皴 -婺 -骛 -缂 -缃 -缄 -彘 -缇 -缈 -缌 -缑 -缒 -缗 -飨 -耢 -瑚 -瑁 -瑜 -瑗 -瑄 -瑕 -遨 -骜 -韫 -髡 -塬 -鄢 -趔 -趑 -摅 -摁 -蜇 -搋 -搪 -搐 -搛 -搠 -摈 -彀 -毂 -搦 -搡 -蓁 -戡 -蓍 -鄞 -靳 -蓐 -蓦 -鹋 -蒽 -蓓 -蓖 -蓊 -蒯 -蓟 -蓑 -蒿 -蒺 -蓠 -蒟 -蒡 -蒹 -蒴 -蒗 -蓥 -颐 -楔 -楠 -楂 -楝 -楫 -楸 -椴 -槌 -楯 -皙 -榈 -槎 -榉 -楦 -楣 -楹 -椽 -裘 -剽 -甄 -酮 -酰 -酯 -酩 -蜃 -碛 -碓 -硼 -碉 -碚 -碇 -碜 -鹌 -辏 -龃 -龅 -訾 -粲 -虞 -睚 -嗪 -韪 -嗷 -嗉 -睨 -睢 -雎 -睥 -嘟 -嗑 -嗫 -嗬 -嗔 -嗝 -戥 -嗄 -煦 -暄 -遢 -暌 -跬 -跶 -跸 -跐 -跣 -跹 -跻 -蛸 -蜊 -蜍 -蜉 -蜣 -畹 -蛹 -嗣 -嗯 -嗥 -嗲 -嗳 -嗌 -嗍 -嗨 -嗐 -嗤 -嗵 -罨 -嵊 -嵩 -嵴 -骰 -锗 -锛 -锜 -锝 -锞 -锟 -锢 -锨 -锩 -锭 -锱 -雉 -氲 -犏 -歃 -稞 -稗 -稔 -筠 -筢 -筮 -筲 -筱 -牒 -煲 -敫 -徭 -愆 -艄 -觎 -毹 -貊 -貅 -貉 -颔 -腠 -腩 -腼 -腭 -腧 -塍 -媵 -詹 -鲅 -鲆 -鲇 -鲈 -稣 -鲋 -鲐 -肄 -鹐 -飕 -觥 -遛 -馐 -鹑 -亶 -瘃 -痱 -痼 -痿 -瘐 -瘁 -瘆 -麂 -裔 -歆 -旒 -雍 -阖 -阗 -阙 -羧 -豢 -粳 -猷 -煳 -煜 -煨 -煅 -煊 -煸 -煺 -滟 -溱 -溘 -漭 -滢 -溥 -溧 -溽 -裟 -溻 -溷 -滗 -滫 -溴 -滏 -滃 -滦 -溏 -滂 -滓 -溟 -滪 -愫 -慑 -慊 -鲎 -骞 -窦 -窠 -窣 -裱 -褚 -裨 -裾 -裰 -禊 -谩 -谪 -媾 -嫫 -媲 -嫒 -嫔 -媸 -缙 -缜 -缛 -辔 -骝 -缟 -缡 -缢 -缣 -骟 -耥 -璈 -瑶 -瑭 -獒 -觏 -慝 -嫠 -韬 -叆 -髦 -摽 -墁 -撂 -摞 -撄 -翥 -踅 -摭 -墉 -墒 -榖 -綦 -蔫 -蔷 -靺 -靼 -鞅 -靿 -甍 -蔸 -蔟 -蔺 -戬 -蕖 -蔻 -蓿 -斡 -鹕 -蓼 -榛 -榧 -榻 -榫 -榭 -槔 -榱 -槁 -槟 -槠 -榷 -僰 -酽 -酶 -酹 -厮 -碡 -碴 -碣 -碲 -磋 -臧 -豨 -殡 -霆 -霁 -辕 -蜚 -裴 -翡 -龇 -龈 -睿 -䁖 -睽 -嘞 -嘈 -嘌 -嘁 -嘎 -暧 -暝 -踌 -踉 -蜞 -蜥 -蜮 -蝈 -蜴 -蜱 -蜩 -蜷 -蜿 -螂 -蜢 -嘘 -嘡 -鹗 -嘣 -嘤 -嘚 -嗾 -嘧 -罴 -罱 -幔 -嶂 -幛 -赙 -罂 -骷 -骶 -鹘 -锲 -锴 -锶 -锷 -锸 -锵 -镁 -镂 -犒 -箐 -箦 -箧 -箍 -箸 -箬 -箅 -箪 -箔 -箜 -箢 -箓 -毓 -僖 -儆 -僳 -僭 -劁 -僮 -魃 -魆 -睾 -艋 -鄱 -膈 -膑 -鲑 -鲔 -鲚 -鲛 -鲟 -獐 -觫 -雒 -夤 -馑 -銮 -塾 -麽 -瘌 -瘊 -瘘 -瘙 -廖 -韶 -旖 -膂 -阚 -鄯 -鲞 -粿 -粼 -粽 -糁 -槊 -鹚 -熘 -熥 -潢 -漕 -滹 -漯 -漶 -潋 -潴 -漪 -漉 -漳 -漩 -澉 -潍 -慵 -搴 -窨 -寤 -綮 -谮 -褡 -褙 -褓 -褛 -褊 -谯 -谰 -谲 -暨 -屣 -鹛 -嫣 -嫱 -嫖 -嫦 -嫚 -嫘 -嫡 -鼐 -翟 -瞀 -鹜 -骠 -缥 -缦 -缧 -缨 -骢 -缪 -缫 -耦 -耧 -瑾 -璜 -璀 -璎 -璁 -璋 -璇 -奭 -髯 -髫 -撷 -撅 -赭 -撸 -鋆 -撙 -撺 -墀 -聩 -觐 -鞑 -蕙 -鞒 -蕈 -蕨 -蕤 -蕞 -蕺 -瞢 -蕃 -蕲 -赜 -槿 -樯 -槭 -樗 -樘 -樊 -槲 -醌 -醅 -靥 -魇 -餍 -磔 -磙 -霈 -辘 -龉 -龊 -觑 -瞌 -瞋 -瞑 -嘭 -噎 -噶 -颙 -暹 -噘 -踔 -踝 -踟 -踒 -踬 -踮 -踯 -踺 -踞 -蝽 -蝾 -蝻 -蝰 -蝮 -螋 -蝓 -蝣 -蝼 -噗 -嘬 -颚 -噍 -噢 -噙 -噜 -噌 -噔 -颛 -幞 -幡 -嶙 -嶝 -骺 -骼 -骸 -镊 -镉 -镌 -镍 -镏 -镒 -镓 -镔 -稷 -箴 -篑 -篁 -篌 -篆 -牖 -儋 -徵 -磐 -虢 -鹞 -膘 -滕 -鲠 -鲡 -鲢 -鲣 -鲥 -鲧 -鲩 -獗 -獠 -觯 -馓 -馔 -麾 -廛 -瘛 -瘼 -瘢 -瘠 -齑 -羯 -羰 -𥻗 -遴 -糌 -糍 -糅 -熜 -熵 -熠 -澍 -澌 -潸 -潦 -潲 -鋈 -潟 -潼 -潺 -憬 -憧 -寮 -窳 -谳 -褴 -褟 -褫 -谵 -熨 -屦 -嬉 -勰 -戮 -蝥 -缬 -缮 -缯 -骣 -畿 -耩 -耨 -耪 -璞 -璟 -靛 -璠 -璘 -聱 -螯 -髻 -髭 -髹 -擀 -熹 -甏 -擞 -縠 -磬 -颞 -蕻 -鞘 -颟 -薤 -薨 -檠 -薏 -薮 -薜 -薅 -樾 -橛 -橇 -樵 -檎 -橹 -樽 -樨 -橼 -墼 -橐 -翮 -醛 -醐 -醍 -醚 -磲 -赝 -飙 -殪 -霖 -霏 -霓 -錾 -辚 -臻 -遽 -氅 -瞟 -瞠 -瞰 -嚄 -嚆 -噤 -暾 -蹀 -踹 -踵 -踽 -蹉 -蹁 -螨 -蟒 -螈 -螅 -螭 -螠 -螟 -噱 -噬 -噫 -噻 -噼 -罹 -圜 -䦃 -镖 -镗 -镘 -镚 -镛 -镝 -镞 -镠 -氇 -氆 -憩 -穑 -篝 -篥 -篦 -篪 -篙 -盥 -劓 -翱 -魉 -魈 -徼 -歙 -膳 -膦 -膙 -鲮 -鲱 -鲲 -鲳 -鲴 -鲵 -鲷 -鲻 -獴 -獭 -獬 -邂 -鹧 -廨 -赟 -瘰 -廪 -瘿 -瘵 -瘴 -癃 -瘳 -斓 -麇 -麈 -嬴 -壅 -羲 -糗 -瞥 -甑 -燎 -燠 -燔 -燧 -濑 -濉 -潞 -澧 -澹 -澥 -澶 -濂 -褰 -寰 -窸 -褶 -禧 -嬖 -犟 -隰 -嬗 -颡 -缱 -缲 -缳 -璨 -璩 -璐 -璪 -螫 -擤 -壕 -觳 -罄 -擢 -薹 -鞡 -鞬 -薷 -薰 -藓 -藁 -檄 -檩 -懋 -醢 -翳 -礅 -磴 -鹩 -龋 -龌 -豳 -壑 -黻 -嚏 -嚅 -蹑 -蹒 -蹊 -蟥 -螬 -螵 -疃 -螳 -蟑 -嚓 -羁 -罽 -罾 -嶷 -黜 
-黝 -髁 -髀 -镡 -镢 -镣 -镦 -镧 -镩 -镪 -镫 -罅 -黏 -簌 -篾 -篼 -簖 -簋 -鼢 -黛 -儡 -鹪 -鼾 -皤 -魍 -龠 -繇 -貘 -邈 -貔 -臌 -膻 -臆 -臃 -鲼 -鲽 -鳀 -鳃 -鳅 -鳇 -鳊 -螽 -燮 -鹫 -襄 -糜 -縻 -膺 -癍 -麋 -懑 -濡 -濮 -濞 -濠 -濯 -蹇 -謇 -邃 -襁 -檗 -擘 -孺 -隳 -嬷 -蟊 -鹬 -鍪 -鏊 -鳌 -鬈 -鬃 -瞽 -鞯 -鞨 -鞫 -鞧 -鞣 -藜 -藠 -藩 -醪 -蹙 -礓 -燹 -餮 -瞿 -曛 -颢 -曜 -躇 -蹚 -鹭 -蟛 -蟪 -蟠 -蟮 -鹮 -黠 -黟 -髅 -髂 -镬 -镭 -镯 -馥 -簟 -簪 -鼬 -雠 -艟 -鳎 -鳏 -鳐 -癞 -癔 -癜 -癖 -糨 -蹩 -鎏 -懵 -彝 -邋 -鬏 -攉 -攒 -鞲 -鞴 -藿 -蘧 -蘅 -麓 -醮 -醯 -酃 -霪 -霭 -霨 -黼 -嚯 -蹰 -蹶 -蹽 -蹼 -蹴 -蹾 -蹿 -蠖 -蠓 -蟾 -蠊 -黢 -髋 -髌 -镲 -籀 -籁 -齁 -魑 -艨 -鳓 -鳔 -鳕 -鳗 -鳙 -麒 -鏖 -羸 -㸆 -瀚 -瀣 -瀛 -襦 -谶 -襞 -骥 -缵 -瓒 -攘 -蘩 -蘖 -醴 -霰 -酆 -矍 -曦 -躅 -鼍 -巉 -黩 -黥 -黪 -镳 -镴 -黧 -纂 -璺 -鼯 -臜 -鳜 -鳝 -鳟 -獾 -孀 -骧 -瓘 -鼙 -醺 -礴 -颦 -曩 -鳢 -癫 -麝 -夔 -爝 -灏 -禳 -鐾 -羼 -蠡 -耱 -懿 -蘸 -鹳 -霾 -氍 -饕 -躐 -髑 -镵 -穰 -饔 -鬻 -鬟 -趱 -攫 -攥 -颧 -躜 -鼹 -癯 -麟 -蠲 -蠹 -躞 -衢 -鑫 -灞 -襻 -纛 -鬣 -攮 -囔 -馕 -戆 -爨 -齉 -亍 -尢 -彳 -卬 -殳 -𠙶 -毌 -邘 -戋 -圢 -氕 -伋 -仝 -冮 -氿 -汈 -氾 -忉 -宄 -讱 -扞 -圲 -圫 -芏 -芃 -朳 -朸 -𨙸 -邨 -吒 -吖 -屼 -屾 -辿 -钆 -仳 -伣 -伈 -癿 -甪 -邠 -犴 -冱 -邡 -闫 -汋 -䜣 -讻 -孖 -纩 -玒 -玓 -玘 -玚 -刬 -坜 -坉 -扽 -坋 -扺 -㧑 -毐 -芰 -芣 -苊 -苉 -芘 -芴 -芠 -芤 -杕 -杙 -杄 -杧 -杩 -尪 -尨 -轪 -坒 -芈 -旴 -旵 -呙 -㕮 -岍 -岠 -岜 -呇 -冏 -觃 -岙 -伾 -㑇 -伭 -佖 -伲 -佁 -飏 -狃 -闶 -汧 -汫 -𣲘 -𣲗 -沄 -沘 -汭 -㳇 -沇 -忮 -忳 -忺 -祃 -诇 -邲 -诎 -诐 -屃 -岊 -阽 -䢺 -阼 -妧 -妘 -𨚕 -纮 -驲 -纻 -纼 -玤 -玞 -玱 -玟 -邽 -邿 -坥 -坰 -坬 -坽 -弆 -耵 -䢼 -𦭜 -茋 -苧 -苾 -苠 -枅 -㭎 -枘 -枍 -矼 -矻 -匼 -旿 -昇 -昄 -昒 -昈 -咉 -咇 -咍 -岵 -岽 -岨 -岞 -峂 -㟃 -囷 -钐 -钔 -钖 -牥 -佴 -垈 -侁 -侹 -佸 -佺 -隹 -㑊 -侂 -佽 -侘 -郈 -舠 -郐 -郃 -攽 -肭 -肸 -肷 -狉 -狝 -饳 -忞 -於 -炌 -炆 -泙 -沺 -泂 -泜 -泃 -泇 -怊 -峃 -穸 -祋 -祊 -鸤 -弢 -弨 -陑 -陎 -卺 -乸 -妭 -姈 -迳 -叕 -驵 -䌹 -驺 -绋 -绐 -砉 -耔 -㛃 -玶 -珇 -珅 -珋 -玹 -珌 -玿 -韨 -垚 -垯 -垙 -垲 -埏 -垍 -耇 -垎 -垴 -垟 -垞 -挓 -垵 -垏 -拶 -荖 -荁 -荙 -荛 -茈 -茽 -荄 -茺 -荓 -茳 -𦰡 -茛 -荭 -㭕 -柷 -柃 -柊 -枹 -栐 -柖 -郚 -剅 -䴓 -迺 -厖 -砆 -砑 -砄 -耏 -奓 -䶮 -轵 -轷 -轹 -轺 -昺 -昽 -盷 -咡 -咺 -昳 -昣 -哒 -昤 -昫 -昡 -咥 -昪 -虷 -虸 -哃 -峘 -耑 -峛 -峗 -峧 -帡 -钘 -钜 -钪 -钬 -钭 -矧 -秬 -俫 -舁 -俜 -俙 -俍 -垕 -衎 -舣 -弇 -侴 -鸧 -䏡 -胠 -𦙶 -胈 -胩 -胣 -朏 -飐 -訄 -饻 -庤 -疢 -炣 -炟 -㶲 -洭 -洘 -洓 -洿 -㳚 -泚 -浈 -浉 -洸 -洑 -洢 -洈 -洚 -洺 -洨 -浐 -㳘 -洴 -洣 -恔 -宬 -窀 -扂 -袆 -祏 -祐 -祕 -叚 -陧 -陞 -娀 -姞 -姱 -姤 -姶 -姽 -枲 -绖 -骃 -彖 -骉 -恝 -珪 -珛 -珹 -琊 -玼 -珖 -珽 -珦 -珫 -珒 -珢 -珕 -珝 -埗 -垾 -垺 -埆 -垿 -埌 -埇 -莰 -茝 -鄀 -莶 -莝 -䓖 -莙 -栻 -桠 -桄 -梠 -栴 -梴 -栒 -酎 -酏 -砵 -砠 -砫 -砬 -硁 -恧 -翃 -郪 -𨐈 -辀 -辁 -剕 -赀 -哢 -晅 -晊 -唝 -哳 -哱 -冔 -晔 -晐 -晖 -畖 -蚄 -蚆 -帱 -崁 -峿 -崄 -帨 -崀 -赆 -钷 -眚 -甡 -笫 -倻 -倴 -脩 -倮 -倕 -倞 -倓 -倧 -衃 -虒 -舭 -舯 -舥 -瓞 -鬯 -鸰 -脎 -朓 -胲 -虓 -鱽 -狴 -峱 -狻 -眢 -勍 -痄 -疰 -痃 -竘 -羖 -羓 -桊 -敉 -烠 -烔 -烶 -烻 -涍 -浡 -浭 -浬 -涄 -涢 -涐 -浰 -浟 -浛 -浼 -浲 -涘 -悈 -悃 -悢 -宧 -窅 -窊 -窎 -扅 -扆 -袪 -袗 -袯 -祧 -隺 -堲 -疍 -𨺙 -陴 -烝 -砮 -㛚 -哿 -翀 -翂 -剟 -绤 -骍 -䂮 -琎 -珸 -珵 -琄 -琈 -琀 -珺 -掭 -堎 -堐 -埼 -掎 -埫 -堌 -晢 -掞 -埪 -壸 -㙍 -聍 -菝 -萚 -菥 -莿 -䓫 -勚 -䓬 -萆 -菂 -菍 -菼 -萣 -䓨 -菉 -䓛 -梼 -梽 -桲 -梾 -桯 -梣 -梌 -桹 -敔 -厣 -硔 -硙 -硚 -硊 -硍 -勔 -䴕 -龁 -逴 -唪 -啫 -翈 -㫰 -晙 -畤 -趼 -跂 -蛃 -蚲 -蚺 -啴 -䎃 -崧 -崟 -崞 -崒 -崌 -崡 -铏 -铕 -铖 -铘 -铚 -铞 -铥 -铴 -牻 -牿 -稆 -笱 -笯 -偰 -偡 -鸺 -偭 -偲 -偁 -㿠 -鄅 -偓 -徛 -衒 -舳 -舲 -鸼 -悆 -鄃 -瓻 -䝙 -脶 -脞 -脟 -䏲 -鱾 -猇 -猊 -猄 -觖 -𠅤 -庱 -庼 -庳 -痓 -䴔 -竫 -堃 -阌 -羝 -羕 -焆 -烺 -焌 -淏 -淟 -淜 -淴 -淯 -湴 -涴 -㥄 -惛 -惔 -悰 -惙 -寁 -逭 -袼 -裈 -祲 -谞 -艴 -弸 -弶 -隃 -婞 -娵 -婼 -媖 -婳 -婍 -婌 -婫 -婤 -婘 -婠 -绹 -骕 -絜 -珷 -琲 -琡 -琟 -琔 -琭 -堾 -堼 -揕 -㙘 -堧 -喆 -堨 -塅 -堠 -絷 -𡎚 -葜 -惎 -萳 -葙 -靬 -葴 -蒇 -蒈 -鄚 -蒉 -蓇 -萩 -蒐 -葰 -葎 -鄑 -蒎 -葖 -蒄 -萹 -棤 -棽 -棫 -椓 -椑 -鹀 -椆 -棓 -棬 -棪 -椀 -楗 -甦 -酦 -觌 -奡 -皕 -硪 -欹 -詟 -辌 -棐 -龂 -黹 -牚 -睎 -晫 -晪 -晱 -𧿹 -蛑 -畯 -斝 -喤 -崶 -嵁 -崾 -嵅 -崿 -嵚 -翙 -圌 -圐 -赑 -淼 -赒 -铹 -铽 -𨱇 -锊 -锍 -锎 -锓 -犇 -颋 -稌 -筀 -筘 -筜 -筥 -筅 -傃 -傉 -翛 -傒 -傕 -舾 -畬 -脿 -腘 -䐃 -腙 -腒 -鲃 -猰 -猯 -㺄 -馉 -鄗 -廋 -廆 -鄌 -粢 -遆 -旐 -焞 -欻 -𣸣 -溚 -溁 -湝 -渰 -湓 -㴔 -渟 -溠 -渼 -溇 -湣 -湑 -溞 -愐 -愃 -敩 -甯 -棨 -扊 -裣 -祼 -婻 -媆 -媞 -㛹 -媓 -媂 -媄 -毵 -矞 -缊 -缐 -骙 -瑃 -瑓 -瑅 -瑆 -䴖 -瑖 -瑝 -瑔 -瑀 -𤧛 -瑳 -瑂 -嶅 -瑑 -遘 -髢 -塥 -堽 -赪 -摛 -塝 -搒 -搌 -蒱 -蒨 -蓏 -蔀 -蓢 -蓂 -蒻 -蓣 -椹 -楪 -榃 -榅 -楒 -楞 -楩 -榇 -椸 -楙 -歅 -碃 -碏 -碈 -䃅 -硿 -鄠 -辒 -龆 -觜 -䣘 -暕 -鹍 -㬊 -暅 -跱 -蜐 -蜎 -嵲 -赗 -骱 -锖 -锘 -锳 -锧 -锪 -锫 -锬 
-稑 -稙 -䅟 -筻 -筼 -筶 -筦 -筤 -傺 -鹎 -僇 -艅 -艉 -谼 -貆 -腽 -腨 -腯 -鲉 -鲊 -鲌 -䲟 -鲏 -雊 -猺 -飔 -觟 -𦝼 -馌 -裛 -廒 -瘀 -瘅 -鄘 -鹒 -鄜 -麀 -鄣 -阘 -煁 -煃 -煴 -煋 -煟 -煓 -滠 -溍 -溹 -滆 -滉 -溦 -溵 -漷 -滧 -滘 -滍 -愭 -慥 -慆 -塱 -裼 -禋 -禔 -禘 -禒 -谫 -鹔 -愍 -嫄 -媱 -戤 -戣 -缞 -耤 -瑧 -瑨 -瑱 -瑷 -瑢 -斠 -摏 -墕 -墈 -墐 -墘 -摴 -銎 -𡐓 -墚 -撖 -靽 -鞁 -蔌 -蔈 -蓰 -蔹 -蔊 -嘏 -榰 -榑 -槚 -𣗋 -槜 -榍 -疐 -酺 -酾 -酲 -酴 -碶 -䃎 -碨 -𥔲 -碹 -碥 -劂 -䴗 -夥 -瞍 -鹖 -㬎 -跽 -蜾 -幖 -嶍 -圙 -𨱏 -锺 -锼 -锽 -锾 -锿 -镃 -镄 -镅 -馝 -鹙 -箨 -箖 -劄 -僬 -僦 -僔 -僎 -槃 -㙦 -鲒 -鲕 -鲖 -鲗 -鲘 -鲙 -𩽾 -夐 -獍 -飗 -凘 -廑 -廙 -瘗 -瘥 -瘕 -鲝 -鄫 -熇 -漹 -漖 -潆 -漤 -潩 -漼 -漴 -㽏 -漈 -漋 -漻 -慬 -窬 -窭 -㮾 -褕 -禛 -禚 -隩 -嫕 -嫭 -嫜 -嫪 -㻬 -麹 -璆 -漦 -叇 -墣 -墦 -墡 -劐 -薁 -蕰 -蔃 -鼒 -槱 -鹝 -磏 -磉 -殣 -慭 -霅 -暵 -暲 -暶 -踦 -踣 -䗖 -蝘 -蝲 -蝤 -噇 -噂 -噀 -罶 -嶲 -嶓 -㠇 -嶟 -嶒 -镆 -镈 -镋 -镎 -镕 -稹 -儇 -皞 -皛 -䴘 -艎 -艏 -鹟 -𩾃 -鲦 -鲪 -鲬 -橥 -觭 -鹠 -鹡 -糇 -糈 -翦 -鹢 -鹣 -熛 -潖 -潵 -㵐 -澂 -澛 -瑬 -潽 -潾 -潏 -憭 -憕 -戭 -褯 -禤 -嫽 -遹 -璥 -璲 -璒 -憙 -擐 -鄹 -薳 -鞔 -黇 -蕗 -薢 -蕹 -橞 -橑 -橦 -醑 -觱 -磡 -𥕢 -磜 -豮 -鹾 -虤 -暿 -曌 -曈 -㬚 -蹅 -踶 -䗛 -螗 -疁 -㠓 -幪 -嶦 -𨱑 -馞 -穄 -篚 -篯 -簉 -鼽 -衠 -盦 -螣 -縢 -鲭 -鲯 -鲰 -鲺 -鲹 -亸 -癀 -瘭 -羱 -糒 -燋 -熻 -燊 -燚 -燏 -濩 -濋 -澪 -澽 -澴 -澭 -澼 -憷 -憺 -懔 -黉 -嬛 -鹨 -翯 -璱 -𤩽 -璬 -璮 -髽 -擿 -薿 -薸 -檑 -櫆 -檞 -醨 -繄 -磹 -磻 -瞫 -瞵 -蹐 -蟏 -㘎 -镤 -镥 -镨 -𨱔 -矰 -穙 -穜 -穟 -簕 -簃 -簏 -儦 -魋 -斶 -艚 -谿 -䲠 -鲾 -鲿 -鳁 -鳂 -鳈 -鳉 -獯 -䗪 -馘 -襕 -襚 -螱 -甓 -嬬 -嬥 -𦈡 -瓀 -釐 -鬶 -爇 -鞳 -鞮 -藟 -藦 -藨 -鹲 -檫 -黡 -礞 -礌 -𥖨 -蹢 -蹜 -蟫 -䗴 -嚚 -髃 -镮 -镱 -酂 -馧 -簠 -簝 -簰 -鼫 -鼩 -皦 -臑 -䲢 -鳑 -鳒 -鹱 -鹯 -癗 -𦒍 -旞 -翷 -冁 -䎖 -瀔 -瀍 -瀌 -襜 -䴙 -嚭 -㰀 -鬷 -醭 -蹯 -蠋 -翾 -鳘 -儳 -儴 -鼗 -𩾌 -鳚 -鳛 -麑 -麖 -蠃 -彟 -嬿 -鬒 -蘘 -欂 -醵 -颥 -甗 -𨟠 -巇 -酅 -髎 -犨 -𨭉 -㸌 -爔 -瀱 -瀹 -瀼 -瀵 -襫 -孅 -骦 -耰 -𤫉 -瓖 -鬘 -趯 -罍 -鼱 -鳠 -鳡 -鳣 -爟 -爚 -灈 -韂 -糵 -蘼 -礵 -鹴 -躔 -皭 -龢 -鳤 -亹 -籥 -鼷 -玃 -醾 -齇 -觿 -蠼 -𬣙 -𬇕 -𬣞 -𬘓 -𫭟 -𫭢 -𫇭 -𫐄 -𫵷 -𬇙 -𬣡 -𫸩 -𫘜 -𬘘 -𫘝 -𬨂 -𬀩 -𬀪 -𬬩 -𫍣 -𬣳 -𬩽 -𬮿 -𬯀 -𫰛 -𬳵 -𬳶 -𫠊 -𬍛 -鿍 -𬜬 -𪾢 -𪨰 -𫓧 -𬬮 -𬬱 -𬬭 -𬘡 -𬳽 -𬘩 -𫄧 -𪟝 -𬍤 -𫭼 -𬜯 -𬂩 -𫠆 -𬌗 -𫑡 -𪨶 -𬬸 -𬬻 -𬬹 -𬬿 -𬭁 -𫢸 -𫗧 -𬊈 -𬒈 -𬳿 -𫄨 -𬘫 -𫮃 -鿎 -𬱖 -𬟽 -𫓯 -𫟹 -𫟼 -𬇹 -𬍡 -𬤇 -𫍯 -𬤊 -𫍲 -𬯎 -𬘬 -𬘭 -𬴂 -𫘦 -𫟅 -𬘯 -𫘧 -𪣻 -𬃊 -𬷕 -𫐐 -𬹼 -𫶇 -𫖮 -鿏 -𬭊 -𫓶 -𬭎 -𫖯 -𬱟 -𫛭 -𫷷 -𬮱 -𬊤 -𬴃 -𫘨 -𬪩 -𬒔 -𬨎 -𫐓 -𫫇 -𫓹 -𬭚 -𬭛 -𬕂 -𬶋 -𬶍 -𫔶 -𫌀 -𫖳 -𫘪 -𫘬 -𫞩 -𪤗 -𬸘 -𬒗 -𫚖 -𬭤 -𫚕 -𬶐 -𬶏 -𬸚 -𬤝 -𬙂 -𬭩 -𬸣 -𫍽 -𬴊 -𬞟 -𫟦 -𬺈 -𫠜 -𪩘 -𬭬 -𬭯 -𫗴 -𬸦 -𫄷 -𬭳 -𬭶 -𫔍 -𬭸 -𬭼 -𫔎 -𬸪 -𬶟 -𬶠 -𬶨 -𫄸 -𬟁 -𬙊 -𬶭 -𬶮 -𬙋 -𬺓 -𫚭 -廠 -蔔 -兒 -幾 -幹 -虧 -纔 -與 -萬 -韆 -億 -個 -廣 -門 -義 -衛 -飛 -習 -馬 -鄉 -豐 -開 -無 -雲 -專 -藝 -廳 -區 -歷 -曆 -車 -貝 -岡 -見 -氣 -長 -僕 -幣 -僅 -從 -侖 -倉 -風 -烏 -鳳 -爲 -鬥 -憶 -計 -訂 -認 -譏 -醜 -隊 -辦 -鄧 -勸 -雙 -書 -擊 -撲 -節 -術 -厲 -龍 -滅 -軋 -東 -盧 -業 -舊 -帥 -歸 -葉 -電 -號 -衹 -隻 -嘰 -嘆 -們 -儀 -叢 -爾 -樂 -處 -鼕 -鳥 -務 -飢 -饑 -馮 -閃 -蘭 -匯 -彙 -頭 -漢 -寧 -討 -寫 -讓 -禮 -訓 -議 -訊 -記 -齣 -遼 -邊 -發 -髮 -聖 -對 -臺 -颱 -檯 -糾 -絲 -動 -鞏 -執 -擴 -掃 -場 -揚 -亞 -樸 -機 -權 -過 -協 -壓 -厭 -頁 -誇 -奪 -達 -夾 -軌 -堯 -劃 -邁 -畢 -貞 -師 -塵 -當 -噹 -籲 -嚇 -蟲 -麯 -團 -糰 -嗎 -嶼 -歲 -迴 -豈 -則 -剛 -網 -硃 -遷 -喬 -偉 -傳 -優 -傷 -價 -倫 -華 -僞 -嚮 -後 -會 -殺 -閤 -衆 -爺 -傘 -創 -雜 -負 -壯 -衝 -妝 -莊 -慶 -劉 -齊 -産 -閉 -問 -闖 -關 -燈 -湯 -興 -講 -諱 -軍 -訝 -許 -訛 -論 -訟 -農 -諷 -設 -訪 -訣 -尋 -盡 -儘 -導 -孫 -陣 -陽 -階 -陰 -婦 -媽 -戲 -觀 -歡 -買 -紅 -馱 -纖 -縴 -馴 -約 -級 -紀 -馳 -紉 -壽 -麥 -瑪 -進 -遠 -違 -韌 -運 -撫 -壇 -罎 -壞 -摳 -擾 -貢 -垻 -壩 -摺 -掄 -搶 -墳 -護 -殻 -塊 -聲 -報 -擬 -蕪 -葦 -蒼 -嚴 -蘆 -勞 -蘇 -囌 -極 -楊 -兩 -麗 -醫 -勵 -還 -殲 -來 -連 -軒 -鹵 -滷 -堅 -時 -縣 -裏 -嘔 -園 -曠 -圍 -噸 -郵 -睏 -員 -聽 -嗆 -嗚 -彆 -嶇 -崗 -帳 -財 -針 -釘 -亂 -體 -傭 -徹 -餘 -穀 -鄰 -腸 -龜 -猶 -狽 -條 -島 -飯 -飲 -係 -繫 -凍 -狀 -畝 -庫 -療 -應 -這 -廬 -閏 -閑 -間 -悶 -竈 -燦 -瀝 -淪 -滄 -溝 -滬 -瀋 -懷 -憂 -窮 -證 -啓 -評 -補 -識 -詐 -訴 -診 -詞 -譯 -靈 -層 -遲 -張 -際 -陸 -陳 -墜 -勁 -鷄 -緯 -驅 -純 -紗 -綱 -納 -駁 -縱 -紛 -紙 -紋 -紡 -驢 -紐 -環 -責 -現 -錶 -規 -攏 -揀 -擔 -頂 -擁 -勢 -攔 -擰 -撥 -擇 -蘋 -範 -莖 -樞 -櫃 -闆 -鬆 -槍 -楓 -構 -喪 -畫 -棗 -賣 -鬱 -礬 -礦 -碼 -厠 -奮 -態 -歐 -毆 -壟 -轟 -頃 -轉 -斬 -輪 -軟 -齒 -虜 -腎 -賢 -國 -暢 -嚨 -鳴 -羅 -幟 -嶺 -凱 -敗 -賬 -販 -貶 -購 -貯 -圖 -釣 -製 -颳 -俠 -僥 -偵 -側 -憑 -僑 -貨 -質 -徑 -捨 -覓 -貪 -貧 -膚 -腫 -脹 -骯 -脅 -魚 -獰 -備 -飾 -飽 -飼 -變 -龐 -廟 -瘧 -劑 -廢 -閘 -鬧 -鄭 -捲 -單 -爐 -淺 -濘 -瀉 -潑 -澤 -憐 -學 -寶 -寵 -審 -簾 -實 -試 -詩 -誠 -襯 -視 -話 -誕 -詭 -詢 -該 -詳 
-肅 -録 -隸 -彌 -瀰 -陝 -駕 -參 -艱 -綫 -練 -組 -紳 -細 -駛 -織 -駒 -終 -駐 -絆 -駝 -紹 -繹 -經 -貫 -貳 -幫 -項 -挾 -撓 -趙 -擋 -墊 -擠 -揮 -薦 -帶 -繭 -蕩 -榮 -葷 -熒 -鬍 -蔭 -藥 -標 -棧 -棟 -欄 -檸 -樹 -鹹 -磚 -硯 -麵 -牽 -鷗 -殘 -軸 -輕 -鴉 -戰 -點 -臨 -覽 -竪 -嘗 -啞 -顯 -貴 -蝦 -蟻 -螞 -雖 -駡 -勛 -嘩 -響 -喲 -峽 -罰 -賤 -貼 -貽 -鈣 -鈍 -鈔 -鍾 -鐘 -鋼 -鈉 -鑰 -欽 -鈞 -鈎 -鈕 -氈 -氫 -選 -適 -種 -鞦 -復 -複 -倆 -貸 -順 -儉 -須 -鬚 -劍 -朧 -膽 -勝 -狹 -獅 -獨 -獄 -貿 -餌 -饒 -蝕 -餃 -餅 -巒 -彎 -將 -奬 -瘡 -瘋 -親 -閨 -聞 -閩 -閥 -閣 -養 -薑 -類 -婁 -總 -煉 -爍 -爛 -窪 -潔 -灑 -澆 -濁 -測 -瀏 -濟 -渾 -濃 -惱 -舉 -覺 -憲 -竊 -誡 -誣 -語 -襖 -誤 -誘 -誨 -説 -誦 -墾 -晝 -費 -遜 -隕 -險 -嬌 -賀 -壘 -綁 -絨 -結 -繞 -驕 -繪 -給 -絢 -駱 -絡 -絶 -絞 -駭 -統 -艷 -蠶 -頑 -盞 -撈 -載 -趕 -鹽 -損 -撿 -摯 -剝 -熱 -搗 -壺 -聶 -萊 -蓮 -獲 -穫 -惡 -噁 -瑩 -鶯 -檔 -橋 -樺 -樁 -樣 -賈 -礫 -礎 -顧 -轎 -較 -頓 -斃 -緻 -慮 -監 -緊 -黨 -曬 -曉 -嘮 -鴨 -暈 -鴦 -罷 -圓 -賊 -賄 -賂 -贜 -錢 -鉗 -鑽 -鉀 -鐵 -鈴 -鉛 -犧 -敵 -積 -稱 -筆 -債 -傾 -賃 -艦 -艙 -聳 -愛 -頒 -頌 -臟 -髒 -臍 -膠 -腦 -膿 -鴕 -鴛 -皺 -餓 -餒 -戀 -槳 -漿 -準 -癥 -齋 -離 -資 -競 -閲 -煩 -燒 -燭 -遞 -濤 -澇 -渦 -塗 -滌 -潤 -澗 -漲 -燙 -澀 -憫 -寬 -傢 -賓 -竅 -請 -諸 -諾 -讀 -誹 -襪 -課 -誰 -調 -諒 -諄 -談 -誼 -懇 -劇 -難 -預 -絹 -綉 -驗 -繼 -駿 -瑣 -擲 -據 -摻 -職 -蘿 -螢 -營 -蕭 -薩 -夢 -檢 -醖 -碩 -聾 -襲 -輔 -輛 -顱 -懸 -躍 -纍 -囉 -嘯 -嶄 -邏 -嬰 -銬 -鐺 -鋁 -銅 -銘 -鏟 -銀 -矯 -穢 -籠 -償 -軀 -釁 -銜 -盤 -鴿 -斂 -領 -臉 -獵 -餡 -館 -癢 -鏇 -閻 -闡 -蓋 -斷 -獸 -鴻 -漸 -淵 -漁 -澱 -滲 -慚 -懼 -驚 -慘 -慣 -謀 -諜 -謊 -諧 -禱 -禍 -謂 -諺 -謎 -彈 -墮 -隨 -隱 -嬸 -頗 -頸 -績 -緒 -續 -騎 -綽 -繩 -維 -綿 -綳 -綢 -綜 -綻 -緑 -綴 -瓊 -趨 -攬 -攙 -擱 -摟 -攪 -聯 -蔣 -韓 -橢 -確 -頰 -靂 -暫 -翹 -輩 -鑿 -輝 -賞 -睞 -噴 -疇 -踐 -遺 -鵑 -賦 -賭 -贖 -賜 -賠 -鑄 -鋪 -鏈 -銷 -鎖 -鋤 -鍋 -銹 -鋒 -鋅 -鋭 -鵝 -築 -篩 -儲 -懲 -禦 -釋 -臘 -魯 -憊 -饋 -饞 -裝 -蠻 -闊 -糞 -滯 -濕 -潰 -濺 -灣 -憤 -竄 -窩 -褲 -禪 -謝 -謡 -謗 -謙 -屬 -屢 -緬 -纜 -緝 -緞 -緩 -締 -縷 -騙 -編 -騷 -緣 -鵡 -攝 -擺 -襬 -攤 -鵲 -藍 -濛 -懞 -矇 -獻 -欖 -樓 -賴 -礙 -尷 -霧 -輻 -輯 -輸 -頻 -齡 -鑒 -蹺 -蝸 -錯 -錨 -錫 -鑼 -錘 -錐 -錦 -鍵 -鋸 -錳 -辭 -頽 -籌 -簽 -籤 -簡 -膩 -鵬 -騰 -鮑 -穎 -觸 -雛 -饃 -餾 -醬 -謄 -糧 -數 -滿 -濾 -濫 -灕 -濱 -灘 -譽 -窺 -寢 -謹 -謬 -闢 -縛 -縫 -纏 -繽 -贅 -墻 -衊 -藹 -檻 -釀 -願 -轄 -輾 -顆 -踴 -蠟 -蠅 -蟬 -賺 -鍬 -鍛 -鍍 -穩 -籮 -簫 -輿 -鮮 -饅 -瀟 -賽 -譚 -譜 -騾 -縮 -攆 -聰 -藴 -櫻 -飄 -黴 -瞞 -題 -囑 -鎮 -鎬 -鎊 -簍 -鯉 -鯽 -癟 -癱 -顔 -鯊 -瀾 -額 -譴 -鶴 -繚 -顛 -轍 -鸚 -贈 -鏡 -贊 -籃 -籬 -鯨 -癮 -辯 -瀕 -懶 -繮 -繳 -矚 -贍 -鰐 -辮 -贏 -驟 -囂 -鐮 -鰭 -鷹 -巔 -顫 -癬 -鱉 -鬢 -鱗 -躪 -贛 -鑲 -韋 -閂 -訃 -勱 -芻 -鄺 -訐 -訌 -訕 -訖 -馭 -璣 -壙 -捫 -薌 -厙 -釔 -傴 -倀 -傖 -獷 -獁 -鳬 -鄔 -餳 -懺 -謳 -詎 -訥 -紆 -紂 -紇 -紈 -璵 -摶 -塢 -㩳 -蕓 -藶 -莧 -萇 -蓯 -磯 -奩 -歟 -軔 -鄴 -嘸 -囈 -嚦 -暘 -唄 -幃 -峴 -嵐 -圇 -釗 -釙 -釕 -僉 -鳩 -鄒 -飩 -餼 -飪 -飫 -飭 -廡 -癤 -闈 -閎 -閔 -煬 -灃 -漚 -渢 -潙 -憮 -慪 -愾 -悵 -愴 -詁 -訶 -詛 -詆 -謅 -詔 -詒 -隴 -陘 -嫵 -嫗 -嬀 -剄 -紜 -紕 -紝 -綸 -紓 -瑋 -匭 -壚 -擓 -蘢 -蔦 -塋 -煢 -櫪 -梘 -棖 -樅 -碭 -甌 -郟 -軛 -鳶 -曇 -蟣 -黽 -嚀 -噝 -巋 -劌 -剴 -嶧 -釷 -釺 -釧 -釩 -釹 -釵 -儈 -儕 -儂 -劊 -慫 -糴 -戧 -膞 -邇 -梟 -餞 -飴 -癘 -瘍 -煒 -熰 -熗 -瀧 -瀘 -濼 -涇 -㥮 -懌 -誆 -誄 -詿 -詰 -詼 -鄆 -禕 -誅 -詵 -詬 -詮 -詣 -諍 -詫 -諢 -詡 -駑 -紺 -紲 -紱 -駟 -駙 -縐 -絀 -驛 -駘 -瓏 -頇 -埡 -撾 -撻 -賁 -壋 -撏 -莢 -貰 -蓽 -蕎 -薈 -薺 -堊 -滎 -犖 -蕁 -藎 -蓀 -蕒 -葤 -櫛 -櫳 -櫨 -櫟 -檉 -酈 -硨 -碸 -殤 -軲 -軻 -轤 -軼 -軫 -蠆 -覘 -瞘 -嘵 -嗶 -噦 -剮 -鄖 -噲 -噥 -嶢 -幀 -嶠 -貺 -鈈 -鈦 -鋇 -鈑 -鈐 -鎢 -鈁 -鈀 -篤 -儔 -儼 -儷 -腖 -臚 -脛 -鴇 -獪 -颮 -猻 -餉 -餄 -餎 -孿 -孌 -癧 -瘲 -颯 -闥 -閭 -闓 -閡 -熾 -烴 -浹 -澮 -滸 -潯 -濜 -慟 -懨 -愷 -惻 -惲 -誚 -禰 -誥 -誑 -鴆 -婭 -嬈 -懟 -絝 -驍 -驊 -絎 -絳 -駢 -頊 -璫 -琿 -塒 -塤 -堝 -贄 -蒔 -萵 -蕕 -鴣 -蒓 -橈 -楨 -榿 -檜 -邐 -礪 -礱 -軾 -輊 -輅 -鶇 -躉 -齔 -鸕 -矓 -嘜 -鴞 -蜆 -嗩 -嶗 -崍 -覬 -賅 -鈺 -鉦 -鈷 -鉢 -鈸 -鉞 -鉭 -鉬 -鈿 -鈾 -鉑 -鑠 -鉚 -鈰 -鉉 -鉈 -鉍 -鈮 -鈹 -鏺 -鐸 -氬 -筧 -頎 -徠 -膾 -鴟 -璽 -鴝 -獫 -裊 -餑 -欒 -攣 -癰 -痙 -頏 -閫 -鬮 -誾 -閬 -鄲 -燁 -燴 -燼 -淶 -漣 -潿 -慳 -諏 -諑 -禎 -諉 -諛 -諗 -諂 -誶 -媧 -嫻 -綆 -驪 -綃 -騁 -綏 -縧 -綈 -駸 -鷥 -燾 -璉 -麩 -擄 -摑 -鷙 -撣 -慤 -摜 -縈 -槤 -覡 -欞 -嗇 -匱 -硤 -磽 -鴯 -龔 -殞 -殮 -賚 -輒 -塹 -嘖 -囀 -嚙 -蹌 -蠣 -蠱 -蟶 -幘 -幗 -賕 -賑 -賒 -銠 -鉺 -鋏 -鐃 -銦 -鎧 -鍘 -銖 -銑 -鋌 -鏵 -銓 -鎩 -鉿 -銚 -鉻 -錚 -銫 -鉸 -銥 -銃 -銨 -銣 -鴰 -穠 -箋 -籩 -僨 -僂 -皚 -鴴 -艫 -龕 -玀 -獼 -餜 -餛 -鸞 -闍 -閾 -閹 -閶 -鬩 -閽 -閼 -羥 -糲 -燜 -漬 -瀆 -澠 -愜 -憚 -諶 -諫 -皸 -謔 -襠 -謁 -諤 -諭 -諼 -讒 -諳 -諦 -諞 -糶 -嬋 -綾 -騏 -綺 -緋 -緔 -騍 -緄 -騅 -綬 -綹 -綣 -綰 -驂 -緇 -靚 -輦 -黿 -頡 -撳 -蟄 -壪 -蔞 
-櫝 -欏 -賫 -鵓 -鸝 -殫 -輥 -輞 -槧 -輟 -輜 -瞼 -躒 -蛺 -蟯 -螄 -蠐 -嘍 -嶸 -嶁 -賧 -鋙 -錸 -鏗 -鋥 -鋰 -鋯 -鋨 -銼 -鐧 -銻 -鋃 -鋦 -錒 -犢 -鵠 -篳 -牘 -儻 -儐 -儺 -嬃 -頜 -鵒 -魷 -魨 -魴 -潁 -颶 -觴 -熲 -餷 -餿 -褻 -臠 -癆 -癇 -賡 -頦 -鷳 -闌 -闃 -闋 -鵜 -憒 -嚳 -謨 -褳 -襇 -讜 -謖 -謚 -謐 -騭 -巰 -翬 -騖 -緙 -緗 -緘 -緹 -緲 -緦 -緱 -縋 -緡 -饗 -耮 -驁 -韞 -攄 -擯 -轂 -驀 -鶓 -薊 -蘺 -鎣 -頤 -櫚 -櫸 -磧 -磣 -鵪 -輳 -齟 -齙 -韙 -囁 -躂 -蹕 -躚 -躋 -噯 -鍺 -錛 -錡 -鍀 -錁 -錕 -錮 -鍁 -錈 -錠 -錙 -覦 -頷 -鮁 -鮃 -鮎 -鱸 -穌 -鮒 -鮐 -鵮 -颼 -饈 -鶉 -瘮 -闔 -闐 -闕 -灧 -瀅 -潷 -灤 -澦 -懾 -鱟 -騫 -竇 -謾 -謫 -嬡 -嬪 -縉 -縝 -縟 -轡 -騮 -縞 -縭 -縊 -縑 -騸 -覯 -韜 -靉 -攖 -薔 -藺 -鶘 -檳 -櫧 -釅 -殯 -霽 -轅 -齜 -齦 -瞜 -曖 -躊 -蟈 -鶚 -嚶 -羆 -賻 -罌 -鶻 -鍥 -鍇 -鍶 -鍔 -鍤 -鏘 -鎂 -鏤 -簀 -篋 -簞 -籙 -臏 -鮭 -鮪 -鱭 -鮫 -鱘 -饉 -鑾 -瘻 -闞 -鮝 -糝 -鷀 -瀲 -濰 -譖 -褸 -譙 -讕 -譎 -鶥 -嬙 -鶩 -驃 -縹 -縵 -縲 -纓 -驄 -繆 -繅 -耬 -瓔 -擷 -擼 -攛 -聵 -覲 -韃 -鞽 -蘄 -賾 -檣 -靨 -魘 -饜 -轆 -齬 -齪 -覷 -顒 -躓 -躑 -蠑 -螻 -顎 -嚕 -顓 -鑷 -鎘 -鎸 -鎳 -鎦 -鎰 -鎵 -鑌 -簣 -鷂 -鯁 -鱺 -鰱 -鰹 -鰣 -鯀 -鯇 -觶 -饊 -饌 -齏 -讞 -襤 -譫 -屨 -纈 -繕 -繒 -驏 -擻 -顳 -顢 -藪 -櫓 -櫞 -贋 -飆 -鏨 -轔 -蟎 -鐯 -鏢 -鏜 -鏝 -鏰 -鏞 -鏑 -鏃 -鏐 -氌 -穡 -魎 -鯪 -鯡 -鯤 -鯧 -鯝 -鯢 -鯛 -鯔 -獺 -鷓 -贇 -癭 -斕 -瀨 -顙 -繾 -繰 -繯 -蘚 -鷯 -齲 -齷 -躡 -蹣 -羈 -鐔 -鐝 -鐐 -鐓 -鑭 -鑹 -鏹 -鐙 -籪 -鷦 -鱝 -鰈 -鯷 -鰓 -鰍 -鰉 -鯿 -鷲 -懣 -鷸 -鰲 -韉 -顥 -鷺 -䴉 -髏 -鑊 -鐳 -鐲 -讎 -鰨 -鰥 -鰩 -癩 -攢 -靄 -躥 -髖 -髕 -鑔 -籟 -鰳 -鰾 -鱈 -鰻 -鱅 -讖 -驥 -纘 -瓚 -鼉 -黷 -黲 -鑣 -鑞 -臢 -鱖 -鱔 -鱒 -驤 -顰 -鱧 -癲 -灝 -鸛 -鑱 -趲 -顴 -躦 -饢 -戇 -戔 -訏 -訒 -釓 -俔 -閆 -澫 -訢 -訩 -詝 -紃 -纊 -瑒 -剗 -塸 -壢 -埨 -撝 -蔿 -榪 -軑 -軏 -咼 -㠣 -覎 -㑳 -颺 -閌 -潕 -湋 -澐 -浿 -諓 -禡 -詗 -詘 -詖 -屓 -彄 -紘 -馹 -馼 -紵 -紞 -駃 -紖 -瑲 -薴 -棡 -軝 -暐 -晛 -崬 -釴 -釤 -鍆 -鍚 -鄶 -獮 -飿 -嶨 -詷 -詪 -鄩 -鳲 -隑 -隮 -娙 -逕 -駓 -駔 -駉 -絅 -騶 -䮄 -紼 -紿 -瓅 -韍 -墶 -塏 -薘 -蕘 -蔄 -葒 -鳾 -龑 -軹 -軤 -轢 -軺 -睍 -曨 -噠 -鈃 -鈇 -鉅 -鋹 -釿 -錀 -鈧 -鈥 -鈄 -倈 -艤 -鶬 -颭 -餏 -湞 -溮 -滻 -褘 -絰 -駰 -絪 -駪 -綎 -綖 -驫 -勣 -璕 -𡑍 -䓣 -薟 -藭 -椏 -梜 -頍 -硜 -輄 -輈 -輇 -貲 -嗊 -曄 -暉 -鄳 -幬 -輋 -嶮 -贐 -鉥 -鉕 -鑪 -鉮 -鉊 -鉧 -僤 -鴒 -魛 -餗 -燖 -溳 -礐 -窵 -襏 -駼 -絺 -綌 -騂 -綄 -璡 -墠 -壼 -聹 -蘀 -勩 -罃 -檮 -棶 -厴 -䃮 -磑 -礄 -鴷 -齕 -頔 -廼 -凢 -亾 -枒 -屍 -匃 -匄 -紥 -紮 -疋 -殀 -讐 -觔 -兇 -宂 -㕥 -㠯 -栞 -佈 -佔 -呌 -敂 -冄 -坵 -僊 -怱 -悤 -冊 -夘 -戼 -牠 -妳 -嬭 -摃 -釦 -攷 -託 -衺 -衕 -弔 -喫 -囙 -㠶 -颿 -秊 -倣 -髣 -佀 -朶 -氷 -決 -併 -並 -竝 -汙 -汚 -異 -姦 -廵 -挵 -衖 -搤 -阯 -撦 -埳 -阬 -誌 -㕁 -卻 -刦 -刧 -刼 -芲 -蘤 -桿 -槓 -荳 -獃 -唫 -脗 -皁 -彿 -髴 -疘 -刪 -鉋 -鑤 -況 -牀 -恡 -棄 -洶 -汎 -災 -烖 -菑 -禩 -侷 -跼 -坿 -玅 -姉 -妬 -翫 -搨 -柺 -拕 -牴 -觝 -倖 -抝 -盃 -桮 -傑 -逩 -肎 -菓 -崐 -崑 -呪 -虖 -嘑 -謼 -詠 -㟁 -嵒 -巗 -巖 -雰 -稈 -咊 -嶽 -妷 -姪 -廹 -徃 -餚 -採 -寀 -唸 -週 -昬 -兎 -兔 -亯 -亱 -䘚 -淨 -劵 -匟 -㳒 -灋 -洩 -霑 -淚 -註 -恠 -箒 -屆 -絃 -圅 -旾 -珎 -掛 -垜 -艸 -茘 -査 -栢 -柵 -栁 -桺 -柹 -韮 -揹 -昰 -閧 -鬨 -冐 -暎 -嚥 -倃 -𠴰 -偺 -喒 -齩 -欬 -榘 -㑺 -儁 -敍 -敘 -肧 -脈 -䘑 -衇 -跡 -蹟 -砲 -礮 -薙 -鬀 -恆 -怳 -卹 -䘏 -賉 -婣 -畊 -揑 -綑 -輓 -恥 -躭 -晉 -棲 -覈 -慄 -翄 -脣 -槕 -㨪 -螡 -蟁 -㤙 -陗 -峩 -峯 -乗 -椉 -咲 -筍 -俛 -頫 -勌 -䠶 -躳 -慇 -拏 -㧱 -挐 -脃 -胷 -肐 -貍 -㽞 -畱 -淒 -悽 -蓆 -効 -傚 -涼 -缾 -菸 -煙 -淛 -湧 -誖 -猂 -醼 -讌 -㝠 -寃 -孃 -桒 -毬 -瑠 -璢 -瑯 -㨗 -搥 -搯 -蔆 -惏 -楳 -槑 -捄 -廂 -慽 -慼 -瞇 -埜 -畧 -虵 -稭 -棃 -犂 -迻 -媮 -兠 -舩 -慾 -綵 -腳 -𩓐 -夠 -豬 -貓 -湊 -減 -庻 -蔴 -菴 -朢 -睠 -觕 -麤 -釬 -銲 -痳 -殽 -婬 -滛 -湻 -㴱 -樑 -顇 -㝛 -窰 -窯 -琹 -欵 -墖 -趂 -隄 -愽 -揷 -揫 -煑 -朞 -㪚 -塟 -蔥 -蔕 -稜 -棊 -碁 -椶 -偪 -㕑 -廚 -廈 -鴈 -冣 -㝡 -晳 -鼃 -餧 -餵 -嗁 -諠 -㡌 -賸 -筴 -筞 -筩 -栰 -暠 -皜 -踰 -蝟 -㪟 -燄 -遊 -媿 -嘅 -庽 -窓 -牎 -牕 -窻 -徧 -僱 -帬 -裠 -強 -彊 -疎 -壻 -瓌 -䰟 -皷 -擕 -㩗 -㩦 -攜 -懃 -鞾 -幙 -㮣 -酧 -詶 -醻 -掽 -踫 -㼝 -盌 -磟 -覩 -倸 -㬉 -煗 -煖 -晻 -闇 -炤 -跥 -䗬 -蠭 -寘 -辠 -稺 -穉 -燬 -譭 -瘉 -癒 -顋 -骽 -猨 -蝯 -稟 -痺 -癡 -亷 -㢘 -韻 -泝 -遡 -昚 -躶 -臝 -羣 -㬪 -曡 -疊 -勦 -琍 -瓈 -𤋮 -熈 -牓 -搾 -謌 -堿 -鹻 -鹼 -矁 -燻 -髈 -𤺥 -辢 -旂 -𡚁 -潄 -砦 -詧 -嫰 -櫈 -撐 -墪 -譔 -鞵 -鞌 -蕋 -橤 -蘂 -醕 -譆 -跴 -蹤 -蜨 -蠍 -稾 -殭 -惪 -厀 -襃 -癅 -䊀 -餬 -潛 -癄 -顦 -鷰 -藷 -櫥 -螎 -蹏 -蟇 -譟 -簒 -彫 -琱 -鵰 -餹 -餻 -簷 -粦 -燐 -緐 -幑 -蹧 -粇 -穅 -臋 -籐 -繙 -飜 -孼 -蠏 -燿 -蝡 -稬 -穤 -惷 -覇 -鑵 -戹 -阨 -剳 -帀 -巵 -亙 -佇 -竚 -穽 -岅 -虯 -𦍑 -羗 -啎 -姙 -㘭 -袟 -袠 -逈 -㒺 -犛 -氂 -偘 -甕 -罋 -冺 -姍 -蝨 -琺 -瑇 -尅 -梔 -斮 -斲 -斵 -暱 -毘 -蝱 -吚 -哶 -峝 -粃 -竢 -狥 -秈 -烱 -㳄 -袵 -盇 -涖 -蒞 -碪 -蠔 -唕 -倐 -儵 -雋 -皐 -臯 -衂 -䶊 -臙 -獧 -痾 -皰 -湼 -澣 -濬 -塚 -襢 -娿 -勅 -勑 -戞 -廐 -廄 -眥 -覜 -勗 -啗 -噉 
-傯 -挱 -㥫 -惥 -慂 -陻 -蕚 -萲 -蕿 -蘐 -藼 -櫂 -箠 -槨 -啑 -蹠 -蚘 -痐 -蛕 -蜖 -瘖 -遯 -醃 -飱 -冪 -簑 -枏 -柟 -檝 -楥 -矴 -椗 -嘷 -獋 -粺 -䈰 -諐 -齶 -堘 -疿 -雝 -秔 -稉 -槀 -搉 -廝 -叡 -嘠 -蜋 -筯 -篛 -麞 -糉 -緥 -璿 -髥 -臕 -餈 -剹 -橜 -罇 -蜺 -矙 -憇 -翺 -饍 -瞖 -羴 -羶 -爕 -繦 -騌 -鬉 -騣 -蔾 -䠀 -簮 -躕 -蹵 -䝔 -貛 -鼴 -麐 -塡 -あ -い -う -え -お -か -き -く -け -こ -さ -し -す -せ -そ -た -ち -つ -て -と -な -に -ぬ -ね -の -は -ひ -ふ -へ -ほ -ま -み -む -め -も -や -ゆ -よ -ら -り -る -れ -ろ -わ -を -ん -が -ぎ -ぐ -げ -ご -ざ -じ -ず -ぜ -ぞ -だ -ぢ -づ -で -ど -ば -び -ぶ -べ -ぼ -ぱ -ぴ -ぷ -ぺ -ぽ -ぁ -ぃ -ぅ -ぇ -ぉ -っ -ゃ -ゅ -ょ -ゎ -ゕ -ゖ -ア -イ -ウ -エ -オ -カ -キ -ク -ケ -コ -サ -シ -ス -セ -ソ -タ -チ -ツ -テ -ト -ナ -ニ -ヌ -ネ -ノ -ハ -ヒ -フ -ヘ -ホ -マ -ミ -ム -メ -モ -ヤ -ユ -ヨ -ラ -リ -ル -レ -ロ -ワ -ヲ -ン -ガ -ギ -グ -ゲ -ゴ -ザ -ジ -ズ -ゼ -ゾ -ダ -ヂ -ヅ -デ -ド -バ -ビ -ブ -ベ -ボ -パ -ピ -プ -ペ -ポ -ァ -ィ -ゥ -ェ -ォ -ッ -ャ -ュ -ョ -ヮ -ヵ -ヶ -ヷ -ヸ -ヹ -ヺ -・ -ー -ヽ -ヾ -ヿ -ア -イ -ウ -エ -オ -カ -キ -ク -ケ -コ -サ -シ -ス -セ -ソ -タ -チ -ツ -テ -ト -ナ -ニ -ヌ -ネ -ノ -ハ -ヒ -フ -ヘ -ホ -マ -ミ -ム -メ -モ -ヤ -ユ -ヨ -ラ -リ -ル -レ -ロ -ワ -ヲ -ン -゙ -゚ -ァ -ィ -ゥ -ェ -ォ -ッ -ャ -ュ -ョ -円 -気 -糸 -絵 -楽 -帰 -戸 -広 -黒 -図 -線 -読 -売 -歩 -毎 -亜 -悪 -圧 -扱 -囲 -為 -壱 -隠 -栄 -営 -駅 -塩 -縁 -艶 -応 -桜 -穏 -仮 -価 -箇 -ゑ -ゝ -ゞ -ヰ -ヴ -㈱ -両 -丼 -丿 -亀 -仏 -伝 -侶 -俤 -値 -倶 -倹 -偐 -偽 -働 -儛 -兌 -児 -冑 -冨 -凞 -処 -凪 -別 -剣 -剤 -剰 -劔 -労 -勧 -勲 -匁 -匂 -匲 -卍 -単 -厳 -収 -呂 -呉 -呑 -呰 -唖 -喚 -喩 -喰 -噛 -噺 -嚢 -囃 -団 -圀 -圏 -堀 -堺 -塀 -塁 -塙 -増 -墺 -壊 -壌 -壷 -変 -奨 -姫 -娯 -嫐 -嬢 -嬾 -孁 -宍 -実 -宮 -寔 -寛 -対 -専 -尭 -峠 -崋 -嶋 -巀 -巌 -巣 -巻 -帯 -幇 -庁 -廃 -廻 -弉 -弌 -弐 -弖 -弾 -従 -徳 -徴 -忯 -恵 -悩 -惣 -懐 -懽 -戦 -戯 -戻 -払 -抜 -択 -拝 -拠 -拡 -拵 -挙 -挿 -捗 -捜 -掟 -掲 -掻 -揃 -換 -揺 -摂 -撃 -撹 -斉 -斎 -旛 -旡 -晧 -晩 -暁 -暦 -曽 -杁 -杢 -杣 -杮 -枓 -枠 -枡 -柾 -栂 -栃 -桝 -桟 -桾 -梛 -梱 -梲 -梶 -椙 -検 -椥 -楕 -楡 -楢 -榊 -榎 -槇 -様 -槙 -槻 -樋 -権 -樫 -橿 -檥 -欅 -歎 -歓 -歯 -歳 -歴 -毀 -沖 -沢 -浄 -涙 -済 -渉 -渋 -渓 -渕 -満 -滝 -漑 -潅 -澁 -瀞 -瀬 -焔 -焼 -煇 -煕 -煥 -燗 -爼 -犠 -狛 -猟 -獏 -獣 -珊 -瑤 -甞 -畑 -畠 -畳 -畷 -畺 -痩 -癪 -発 -県 -眞 -砕 -碕 -礒 -禖 -禿 -稲 -穂 -穣 -竃 -竜 -竴 -笹 -筈 -筬 -筰 -箆 -箏 -箙 -篠 -篭 -簺 -籾 -粂 -粋 -粛 -粧 -糺 -紬 -絁 -経 -絖 -絣 -絽 -継 -続 -綟 -総 -縄 -縅 -縒 -縦 -繊 -繋 -繍 -繝 -繧 -纐 -纒 -罠 -罧 -罵 -羂 -羇 -羨 -聟 -聡 -聨 -聴 -脇 -脳 -膣 -膵 -臈 -臓 -臥 -舎 -舖 -舗 -舘 -芿 -苅 -茲 -荊 -荘 -莬 -莵 -菫 -萠 -蔵 -薗 -薫 -薬 -薭 -蘊 -蛍 -蝋 -蝿 -蟷 -衞 -衵 -袙 -袞 -袰 -袴 -袿 -裃 -裡 -裲 -褄 -褌 -襴 -襷 -覗 -覚 -覧 -観 -訳 -証 -諌 -諚 -諟 -諡 -諮 -譛 -譲 -讃 -豅 -豊 -豎 -賎 -賛 -贔 -躙 -躰 -転 -軽 -輌 -辥 -辺 -辻 -込 -逓 -遅 -遙 -邉 -郷 -酔 -醗 -醤 -醸 -釈 -鉄 -鉇 -鉤 -鉱 -鉾 -銈 -銕 -銭 -鋲 -鋳 -鋺 -錆 -錍 -錣 -錬 -錵 -鍑 -鍮 -鍼 -鎌 -鎗 -鎚 -鎹 -鐇 -鐚 -鐡 -鑁 -鑑 -鑚 -鑢 -閇 -関 -閦 -闘 -陥 -険 -隣 -隷 -雑 -雫 -霊 -靜 -靫 -靭 -靱 -鞄 -鞆 -頚 -頬 -頴 -頼 -顕 -顗 -餝 -饂 -駄 -駆 -駈 -騒 -験 -騨 -髄 -髙 -髪 -髷 -鯖 -鯰 -鯱 -鰒 -鰯 -鰰 -鳰 -鴎 -鴫 -鵄 -鵞 -鵺 -鶏 -鹸 -麁 -麺 -麿 -黌 -黙 -鼈 -齢 -龗 -縯 -蟅 -坖 -祂 -鼂 -鱚 -蛻 -屌 -呾 -煔 -吶 -扥 -蚖 -銂 -尃 -夋 -鵼 -徬 -寳 -彡 -舨 -湳 -麼 -鍈 -崈 -鱣 -盺 -拺 -瑥 -茷 -焻 -奀 -驎 -鱰 -砢 -痟 -廱 -僜 -瘺 -鱊 -擥 -嶰 -淓 -跅 -浵 -媗 -璦 -煠 -檊 -媃 -峅 -躄 -鉟 -塽 -蟴 -鯮 -弍 -烒 -鵵 -妑 -孋 -蚡 -恊 -輭 -廞 -產 -曅 -盜 -騤 -囪 -鱀 -茇 -葊 -逹 -狓 -崢 -趖 -凃 -羙 -鮸 -昞 -楿 -渽 -圗 -麪 -屇 -鍉 -葝 -沯 -爭 -幵 -筭 -寊 -銋 -貮 -鎭 -熺 -昜 -鍱 -墬 -愒 -磺 -嚈 -稘 -珮 -釆 -殑 -鍩 -䲁 -蕷 -鐿 -僡 -佹 -輶 -冴 -襶 -賔 -猙 -辧 -絛 -磾 -韁 -螔 -譳 -礑 -鋱 -魩 -嚗 -棆 -牆 -敟 -柶 -瓛 -魣 -巎 -轘 -襌 -枼 -鸌 -逺 -錏 -縡 -帢 -騄 -媼 -埅 -鄤 -萐 -祙 -旼 -詥 -鶲 -燉 -卲 -銱 -庲 -伱 -氽 -嵿 -挻 -煵 -窋 -鐤 -鮊 -鱬 -鰧 -嬤 -譞 -諲 -脭 -悳 -崘 -阭 -內 -袾 -冚 -壐 -咗 -礠 -孮 -痲 -埈 -肹 -鰮 -鮓 -濊 -塜 -凜 -蒢 -噰 -桼 -峍 -焴 -鶒 -鋮 -綠 -鶹 -熿 -毴 -咟 -嘥 -睺 -繡 -郎 -瘞 -鉶 -蔎 -秠 -緤 -蝀 -躝 -蟜 -繃 -囮 -墫 -乭 -胊 -濙 -瘓 -榣 -鑛 -鐫 -嶴 -甹 -坮 -銾 -蒭 -睜 -俋 -餠 -榢 -蓳 -盋 -堷 -鍏 -苝 -巛 -蚵 -暏 -熤 -嬨 -墎 -鏽 -戶 -菺 -膮 -熖 -睪 -栜 -捱 -榗 -鍷 -曧 -犽 -韑 -袓 -䖝 -焄 -喦 -髲 -疌 -㴪 -侊 -貐 -蕅 -禠 -蕑 -囯 -暊 -儞 -佋 -柎 -㐱 -鰤 -苳 -鱥 -謤 -遶 -眀 -鑀 -羋 -顏 -陜 -銩 -黶 -苼 -蒤 -棛 -儫 -咁 -抦 -衚 -棩 -焿 -脫 -麅 -玏 -埧 -淸 -黁 -淽 -彠 -鮨 -沜 -糀 -厓 -楧 -嶌 -簹 -檵 -鱇 -嶬 -廸 -卽 -樀 -贌 -酼 -籛 -沒 -晸 -諪 -蕡 -妏 -鄋 -蒍 -奧 -抇 -蓨 -薆 -鱷 -巘 -䝉 -亰 -寈 -槩 -誒 -麴 -蕟 -溎 -蘗 -榦 -斿 -暟 -炲 -拚 -娖 -繖 -橚 -寜 -爀 -饟 -悅 -鯏 -彜 -眾 -葯 -嬝 -埮 -獇 -馛 -溙 -瀦 -熼 -硓 -鈢 -樆 -輬 -鰜 -蔘 -渙 -澔 -嗮 -旉 -籜 -媊 -燘 -儚 -頹 -缽 -俽 -逨 -鱓 
-郞 -歊 -杴 -珡 -杋 -醁 -鰏 -鵾 -鐽 -鮋 -巶 -荅 -薾 -囓 -蹻 -獎 -禑 -鎓 -榲 -僴 -綞 -尓 -敭 -曔 -褔 -鬅 -亊 -鏦 -蓘 -裬 -鱲 -薡 -鰗 -箑 -鬪 -縂 -璸 -甙 -茮 -辵 -岻 -覿 -滈 -鯶 -鑂 -囶 -舺 -溋 -拋 -菾 -敾 -虨 -綝 -蝍 -醂 -禨 -賹 -廧 -絕 -槗 -徫 -鎔 -曮 -蠂 -捒 -堈 -莕 -蓪 -敎 -禃 -櫱 -綧 -瀶 -逌 -浤 -碻 -刄 -逤 -剏 -氹 -菈 -娫 -蜛 -嵗 -糎 -螶 -譓 -鏳 -嵙 -瑊 -隲 -檨 -緈 -畵 -砯 -簗 -彅 -鰺 -騋 -窶 -嚒 -嵻 -尙 -頵 -槰 -虉 -醞 -巂 -彔 -偊 -畇 -鱨 -妸 -塲 -畐 -鈫 -錟 -磪 -摠 -彥 -璙 -囝 -寗 -耎 -鮡 -蘓 -弅 -焃 -飥 -戙 -塰 -儱 -槺 -噏 -魟 -禵 -佧 -咘 -盪 -瑈 -鉲 -睭 -鏌 -鼇 -郋 -魮 -朖 -滽 -渃 -滙 -熯 -醿 -鎅 -褀 -鬬 -巄 -螥 -眜 -釚 -柉 -壎 -峇 -姸 -唭 -鮜 -鈖 -嫈 -壄 -洤 -黃 -伕 -堦 -嶔 -鮰 -鞞 -漎 -鉓 -鮗 -壴 -阝 -妀 -矽 -獢 -倗 -銪 -鴓 -橒 -凈 -哖 -屚 -偍 -瑺 -媯 -淍 -驌 -椇 -赬 -薐 -糹 -碽 -濲 -釭 -晭 -纕 -寖 -閞 -歿 -呎 -鶆 -屄 -櫿 -犎 -旲 -㙟 -龎 -翜 -螾 -說 -衜 -泆 -軎 -鵂 -荎 -嚧 -硂 -桖 -褭 -筊 -鰷 -秳 -戩 -轀 -鬹 -飬 -卋 -暸 -狦 -搢 -娋 -鏴 -溫 -毉 -淰 -謩 -餺 -鵙 -鳽 -鮀 -狶 -氻 -轝 -妺 -袛 -蓭 -梂 -娛 -牼 -稅 -兿 -玾 -煚 -僩 -鶿 -鬄 -崠 -鉆 -鯓 -蚢 -庀 -鵟 -坣 -殼 -悞 -熅 -敻 -鍠 -曶 -愼 -搳 -姃 -砳 -槼 -臞 -韾 -靑 -鸊 -薲 -虛 -蠄 -啟 -鶺 -苺 -滾 -褞 -仺 -胇 -憻 -郳 -烉 -驩 -冇 -枖 -夌 -搵 -匸 -盨 -櫾 -霤 -麊 -貒 -噓 -嗢 -笩 -晈 -冂 -銳 -毿 -慜 -囧 -閜 -娸 -庢 -壆 -馯 -桱 -兗 -葃 -侅 -煐 -鐦 -藸 -鷎 -嵰 -逎 -弒 -匋 -鐭 -廔 -砩 -孆 -灴 -伷 -兪 -鴗 -澯 -幚 -旙 -勻 -礽 -婑 -鱮 -娍 -銶 -吳 -鍟 -仼 -鳧 -彞 -娽 -昛 -鰼 -剎 -佉 -鉏 -偸 -鰆 -讙 -橪 -啱 -岀 -孻 -釪 -乹 -鈳 -漇 -檦 -埻 -祿 -爌 -禇 -鱵 -㸃 -梉 -燝 -霙 -炁 -飮 -蠙 -勷 -鵎 -儥 -鐠 -唻 -廰 -嚿 -嵕 -墱 -紑 -搖 -瘜 -皝 -鸑 -瀁 -粵 -撚 -巑 -梀 -啯 -眛 -諴 -夊 -僙 -鍝 -裖 -鮣 -凬 -飡 -灊 -橓 -嫳 -筳 -咑 -粍 -瓑 -璌 -伃 -閰 -傜 -黐 -謢 -驒 -橫 -蛯 -寕 -蠵 -瞓 -旳 -翏 -硏 -寯 -韡 -楤 -鰃 -朿 -侞 -鵯 -愨 -祹 -厔 -丌 -盩 -謏 -魕 -啣 -閱 -曺 -枛 -罉 -卐 -樻 -鷉 -鯒 -鋡 -磱 -枱 -攴 -蠷 -穈 -嚟 -檽 -趐 -奐 -鋐 -檇 -薀 -峼 -咭 -訔 -韠 -鑴 -鸐 -唃 -捦 -鸜 -誴 -罳 -璄 -暃 -夀 -賨 -鞥 -鈊 -灡 -鮍 -懮 -籣 -昐 -陁 -襾 -鮠 -鈏 -囍 -婯 -艔 -貭 -䰾 -姁 -禼 -堖 -鋶 -仛 -鏷 -謜 -鑅 -忬 -蘶 -謠 -觙 -奫 -狟 -泩 -桙 -飈 -垰 -啍 -嚞 -鯕 -蒧 -榞 -徸 -璹 -揔 -欉 -魞 -菶 -玧 -鳯 -廍 -侚 -岰 -岧 -鋕 -凵 -彣 -崱 -媜 -倢 -鵐 -砋 -鷚 -鱠 -鮻 -繻 -摵 -贓 -磵 -錻 -痠 -粩 -胅 -奣 -塨 -瀠 -鸘 -啚 -娳 -霶 -壔 -峚 -甂 -廁 -覌 -鰂 -猳 -鱻 -盫 -裿 -杬 -歛 -澋 -蘞 -嵜 -尐 -旽 -鉌 -鎛 -豿 -凖 -榤 -禓 -龝 -悧 -鷟 -鮟 -吋 -喢 -岪 -吥 -漵 -頠 -豔 -巿 -鑨 -醣 -熳 -懍 -湥 -檡 -韺 -戱 -緖 -鐈 -凉 -緃 -鮹 -媐 -爯 -巆 -褍 -鐬 -昍 -扙 -鍳 -芛 -蟳 -嬅 -糬 -吔 -塭 -譿 -冧 -鏓 -嶪 -嗹 -椵 -姀 -閿 -褧 -錞 -玆 -笘 -篔 -萡 -鶡 -螐 -鮄 -鰟 -脷 -啲 -杤 -蓚 -尗 -娎 -殟 -淥 -蝚 -蓧 -彐 -嚤 -銍 -囒 -坶 -淩 -鶼 -鱂 -喼 -燫 -肏 -姵 -廌 -禟 -籝 -迵 -嵨 -堮 -蟌 -憍 -廕 -蜑 -緁 -唘 -竩 -崙 -璚 -粄 -栨 -罈 -梫 -貤 -藔 -蜯 -訁 -斖 -煶 -馦 -妠 -閟 -疕 -夆 -鎪 -膥 -澻 -嘢 -嚐 -靁 -鎻 -鰛 -穵 -烋 -縕 -褎 -疒 -壠 -溼 -圂 -咅 -鯭 -鯙 -磘 -玨 -珤 -朊 -蚼 -濶 -薞 -嚩 -丟 -嫺 -鯻 -椲 -鰕 -刂 -蠘 -踎 -瀴 -琁 -鰶 -瑴 -肜 -㐂 -欥 -媺 -竻 -讚 -𣇉 -裵 -緜 -廩 -齧 -叄 -俌 -厰 -滀 -錄 -鷫 -鯗 -攞 -姌 -蔝 -幷 -縤 -屻 -鯃 -雞 -纁 -嫲 -嵮 -屭 -嶃 -跩 -鋗 -蕢 -篊 -俬 -淎 -暻 -鏻 -憓 -玗 -溈 -笭 -糢 -勳 -閒 -沍 -咾 -鉷 -蘵 -俁 -崵 -毸 -苪 -掙 -鴡 -萭 -俴 -屜 -蒾 -艹 -剷 -慍 -朮 -枴 -氳 -猓 -甽 -箝 -譁 -贗 -迆 -鈽 -鍊 -鍰 -鏍 -靦 -餽 -丮 -丱 -仜 -仩 -伬 -伔 -仱 -伀 -伻 -佢 -佒 -侀 -侇 -佷 -佌 -佪 -侐 -侜 -俓 -侲 -俉 -侻 -侳 -俇 -倅 -倇 -倰 -倛 -倳 -倷 -俷 -倠 -偯 -偞 -偠 -偋 -偝 -偛 -偢 -偅 -偟 -偩 -偫 -傛 -傔 -傞 -傋 -傌 -傎 -傝 -偨 -傂 -傽 -傿 -僆 -傮 -僄 -僈 -傰 -僁 -傱 -僋 -僗 -僛 -僪 -僝 -僓 -僿 -儃 -儰 -僸 -僶 -僾 -儌 -僽 -儜 -儓 -儗 -儑 -儢 -儤 -儠 -儸 -儹 -儽 -冓 -冘 -冞 -凊 -凅 -凔 -刌 -刉 -刓 -刜 -刞 -刵 -刲 -剆 -刱 -剉 -剚 -剒 -剫 -剭 -剬 -剺 -剸 -剻 -剼 -劀 -劋 -劖 -劘 -劗 -劙 -劦 -勴 -匊 -匢 -匰 -匴 -匷 -匽 -卌 -卼 -厎 -厒 -厗 -厞 -厜 -厤 -厬 -厹 -吰 -吷 -吪 -呿 -咈 -呫 -呺 -呥 -呬 -呴 -茍 -咷 -咮 -咶 -哅 -咠 -咢 -唦 -唗 -唒 -哤 -唚 -唈 -哫 -唅 -唴 -啢 -唶 -啒 -啅 -唌 -唲 -喨 -喥 -喭 -噅 -喓 -喣 -啽 -喌 -嗃 -嗛 -嗋 -嗀 -喿 -喍 -嗏 -嗕 -嗈 -嘕 -嘒 -嗼 -嘐 -嘓 -嘂 -嗺 -嘝 -嘄 -嗿 -噈 -噊 -噆 -噚 -嘳 -嘽 -嘾 -噮 -噳 -噣 -噭 -噞 -嚌 -嚍 -嚃 -嚘 -嚜 -嚫 -嚪 -嚬 -嚲 -嚵 -嚽 -嚾 -囆 -囅 -囋 -囗 -圁 -圞 -圠 -坁 -坅 -坲 -坱 -垀 -坴 -垗 -垝 -垔 -垘 -垽 -垼 -埢 -埶 -堩 -堣 -塈 -堥 -塓 -塉 -塯 -塕 -塼 -墆 -塿 -塴 -墋 -塺 -墝 -墯 -壈 -墽 -壖 -壝 -壛 -壾 -壿 -夃 -夎 -夒 -夗 -奅 -奊 -奰 -奲 -奼 -妦 -妎 -妢 -妐 -妵 -姏 -姎 -㚷 -姡 -姺 -姼 -娭 -婐 -婟 -婥 -婓 -婗 -媔 -媟 -媢 -婸 -媦 -媥 -媬 -媕 -娷 -嫇 -嫋 -媰 -媻 -嫮 -嫥 -嫢 -嫛 -嫿 -嫴 -嫷 -嫶 -嬎 -嬓 -嬐 -嬲 -嬽 -孈 -屘 -孲 -孷 -宎 -宨 -寪 -寍 -寋 -寑 -寙 -寠 -寱 -尌 -尒 -尟 -尰 -尳 -屖 -屔 -屝 -屧 -屩 -屮 -屴 -岏 -岋 -岉 -岒 -岮 -岤 -岯 -岟 -岝 -峐 -峌 -峞 -峉 -峊 -峬 -峮 -峷 -崝 -崨 -崥 -崏 -崰 -崣 -崷 -嵃 -嵑 -崳 -崺 -嵂 -嵱 -嵣 
-嵥 -嵞 -嶀 -嵽 -嶆 -嵺 -嵷 -嶊 -嶉 -嶈 -嵾 -嶕 -嶜 -嶡 -嶚 -嶞 -嶱 -嶩 -嶵 -嶭 -巃 -巏 -巕 -巟 -巹 -帊 -帗 -帟 -帣 -帠 -帤 -帩 -帾 -帴 -幏 -幎 -幓 -幩 -幝 -幠 -幧 -幨 -幦 -幭 -幰 -庂 -庉 -庌 -庈 -庰 -庛 -庣 -庨 -庮 -庪 -庬 -庴 -廅 -廇 -廘 -廗 -廎 -廜 -緳 -廦 -廥 -廮 -廯 -蠯 -廾 -弚 -弝 -弣 -弤 -弮 -弳 -彃 -彉 -彋 -彏 -彯 -彴 -彸 -彾 -徦 -徥 -徯 -徲 -徾 -徿 -忀 -忁 -忔 -忕 -忨 -忣 -忷 -忥 -怭 -怲 -怋 -怴 -怗 -怚 -怞 -怬 -怢 -怐 -怮 -怓 -怷 -怹 -恲 -恞 -恅 -恇 -恉 -恛 -恌 -恀 -恟 -悀 -悁 -悕 -悗 -悇 -悊 -悐 -悾 -悺 -惓 -惤 -惈 -悷 -惉 -悹 -惌 -惢 -惄 -愊 -愖 -愅 -惵 -愓 -惸 -惼 -惾 -慉 -慅 -愶 -愲 -愮 -愯 -愬 -慁 -慞 -慱 -慒 -慓 -慲 -憀 -慴 -慔 -慺 -慛 -憃 -慹 -憱 -憰 -憢 -憉 -憛 -憯 -憟 -憪 -憡 -憝 -憖 -懅 -憴 -懆 -懁 -憿 -憸 -憵 -憼 -懧 -懠 -懥 -懤 -懘 -懭 -懱 -懪 -懰 -懫 -懻 -戁 -戃 -戄 -戉 -戠 -酨 -戺 -扐 -扜 -扤 -扡 -扢 -抆 -抌 -抎 -抏 -扻 -抭 -抴 -拑 -抾 -抪 -抶 -抮 -挍 -挋 -挃 -拫 -拹 -挏 -挌 -拸 -挀 -拲 -捖 -挬 -挶 -揤 -捊 -挼 -挩 -捁 -挴 -捘 -捔 -捥 -掝 -掗 -掫 -掯 -捵 -掜 -捼 -掤 -掔 -掱 -揎 -揥 -揨 -揯 -揊 -揲 -揵 -摡 -揟 -揝 -揜 -揘 -揅 -揱 -搆 -搟 -搕 -搘 -搹 -搷 -搣 -搰 -搊 -搚 -摀 -搧 -搫 -摍 -摝 -摲 -摦 -摎 -摋 -摓 -摐 -摿 -摮 -摰 -撢 -撠 -撗 -撜 -撋 -撊 -撌 -撟 -擗 -擖 -擏 -擉 -撽 -擩 -擣 -擫 -擭 -擨 -擽 -擸 -攇 -攐 -攍 -攌 -攗 -攕 -攓 -攡 -攠 -攦 -攩 -攭 -攲 -攳 -敁 -敊 -敆 -敓 -敧 -敪 -敤 -敜 -敯 -敳 -敶 -敺 -敹 -敿 -斁 -斀 -斄 -斒 -斔 -斞 -斨 -斪 -斻 -旍 -旓 -旚 -旝 -旟 -昲 -昦 -昢 -晇 -晥 -晜 -晼 -晬 -暀 -暆 -暍 -暋 -暡 -暰 -暩 -曀 -曊 -曋 -曏 -曒 -曚 -曣 -曭 -朁 -朅 -朄 -朒 -朘 -朣 -朾 -朹 -朻 -朼 -杅 -杇 -杝 -杗 -枎 -杶 -枆 -枌 -柲 -枺 -枻 -柸 -柀 -柅 -柫 -柤 -柍 -柮 -柣 -柂 -柧 -栚 -桋 -桏 -栱 -栵 -栫 -栭 -栯 -栘 -栔 -梡 -梇 -梐 -桭 -梮 -楖 -梬 -梩 -桵 -梒 -椌 -椄 -棜 -棷 -棳 -棌 -椈 -楰 -棯 -椔 -棸 -楟 -楎 -楱 -楅 -楺 -楈 -楛 -楉 -楬 -椳 -楀 -楄 -楶 -楘 -榶 -槉 -榠 -榬 -榼 -榙 -榩 -榾 -榯 -槄 -榽 -榹 -槥 -槸 -樕 -樠 -槬 -槢 -樛 -樝 -槾 -樧 -槮 -樔 -槷 -橀 -樴 -橉 -橧 -樲 -橨 -橝 -橭 -橶 -樿 -橁 -檍 -檖 -檁 -檟 -橾 -檛 -檓 -檕 -檃 -櫅 -檹 -櫡 -櫠 -櫌 -櫑 -櫙 -櫋 -櫜 -櫐 -櫫 -櫬 -櫰 -櫹 -櫺 -櫼 -欃 -欋 -欈 -欐 -欑 -欘 -欨 -欴 -欯 -欭 -欱 -欶 -欳 -欷 -欿 -歂 -歈 -歍 -歋 -歕 -歔 -歜 -歠 -歭 -歾 -肂 -殈 -殏 -殔 -殗 -殙 -殠 -殥 -殢 -殦 -殧 -殰 -殶 -毃 -毄 -毈 -毇 -毊 -毚 -毞 -毦 -毤 -毨 -毣 -毰 -毲 -毻 -毼 -毾 -氁 -氀 -氄 -氠 -氶 -汃 -汒 -汏 -汍 -汸 -沋 -汱 -汯 -沕 -汦 -汳 -泬 -沶 -沬 -泧 -沷 -泭 -泲 -泒 -沴 -洟 -洊 -洀 -浺 -浶 -洍 -涒 -浘 -浢 -涊 -涆 -浧 -涗 -涳 -涬 -淢 -涷 -淔 -渀 -淈 -涾 -淊 -涽 -淭 -湆 -湇 -湅 -湢 -渿 -湁 -渜 -渳 -湀 -渻 -渮 -湨 -湡 -渱 -渨 -湠 -湱 -湩 -渹 -溛 -滖 -溓 -溔 -滒 -溰 -溾 -滜 -滵 -滱 -漃 -漥 -漮 -潎 -漙 -漧 -漘 -漒 -滭 -漊 -潳 -滮 -潀 -漰 -潃 -漅 -濆 -澒 -澅 -潚 -潠 -澖 -潶 -潬 -潒 -潐 -潗 -澓 -潝 -濇 -濎 -濈 -濄 -澞 -澨 -瀄 -濌 -澩 -濴 -濔 -濣 -濭 -濧 -濦 -瀇 -瀎 -濿 -瀀 -濻 -瀙 -瀖 -瀫 -瀡 -瀢 -瀩 -瀯 -瀷 -灂 -瀸 -瀿 -瀺 -灄 -灉 -灖 -灗 -灛 -灟 -灨 -灩 -灪 -炾 -炰 -烓 -烑 -缹 -焍 -烰 -焠 -焮 -焣 -煆 -煣 -煝 -熐 -熉 -熀 -熂 -熚 -燅 -燂 -熸 -燀 -燡 -爁 -爊 -爂 -爓 -爞 -爢 -爣 -牄 -牉 -牋 -牏 -牣 -牬 -牰 -牸 -牷 -犈 -犉 -犆 -犅 -犌 -犑 -犐 -犗 -犕 -犓 -犘 -犚 -犝 -犞 -犥 -犦 -犤 -犣 -犩 -犪 -犮 -犵 -犿 -狆 -狖 -狋 -狘 -狜 -狔 -狚 -狌 -狑 -狊 -狤 -狫 -狪 -狣 -猀 -狾 -猑 -猘 -猈 -狿 -猏 -猋 -猒 -猧 -猲 -猭 -猦 -猣 -猵 -猼 -獂 -獀 -獊 -獑 -獌 -獘 -獞 -獟 -獝 -獛 -獡 -獩 -獦 -獥 -獳 -獶 -獽 -獿 -玂 -玁 -玈 -玊 -玔 -珓 -珶 -琖 -瑵 -璊 -瑽 -璅 -瑿 -璗 -瓁 -瓋 -瓝 -瓟 -瓡 -瓥 -瓨 -瓬 -瓵 -瓾 -瓽 -甀 -甃 -甈 -甋 -甐 -甒 -甔 -甖 -甝 -甮 -甿 -畟 -畣 -畽 -疀 -疧 -痁 -疻 -痀 -痎 -痏 -痋 -痌 -痑 -痚 -痡 -痝 -痗 -痯 -瘏 -痷 -痸 -痻 -瘈 -瘑 -瘝 -瘣 -瘯 -瘱 -瘽 -癈 -癉 -癙 -癐 -癓 -癠 -癵 -癹 -皊 -皏 -皫 -皯 -皵 -皻 -皽 -皾 -盄 -盓 -盝 -盬 -盭 -盳 -眃 -眅 -盻 -眝 -眐 -眓 -眒 -眣 -眑 -眕 -眹 -眱 -眲 -眴 -眳 -眽 -睆 -睅 -睊 -睋 -睌 -睕 -睟 -睒 -睖 -睩 -睧 -睔 -瞁 -睼 -瞂 -睮 -睯 -瞏 -瞉 -瞚 -瞝 -瞡 -瞛 -瞲 -瞷 -瞶 -瞴 -矂 -矉 -矊 -矌 -矎 -矏 -矐 -矔 -矕 -矘 -矠 -矱 -矲 -矹 -矺 -砅 -砐 -砏 -砎 -砨 -硈 -硉 -硠 -硥 -硱 -硰 -硩 -碔 -碄 -碅 -碆 -硾 -碫 -碞 -磍 -磌 -磎 -磈 -磃 -磝 -磩 -磥 -磞 -磛 -磳 -磼 -磿 -礔 -礉 -礝 -礛 -礜 -礥 -礣 -礧 -礨 -礭 -礿 -祌 -祅 -祔 -祒 -祑 -祤 -祩 -祪 -祣 -祫 -祡 -祴 -祳 -禂 -禗 -禜 -禫 -禭 -禬 -禴 -禷 -禸 -歶 -秅 -秏 -秖 -秎 -秮 -秪 -秺 -秶 -稊 -稒 -稫 -穊 -稰 -稯 -穋 -穛 -穖 -穧 -穨 -穮 -穬 -穭 -穱 -穾 -窆 -窉 -窌 -窏 -窔 -窐 -窙 -窢 -窞 -窫 -窲 -窴 -窱 -窾 -竀 -竁 -竷 -笐 -笓 -笅 -笵 -笻 -笴 -笰 -笢 -笝 -笲 -筄 -筡 -箈 -箊 -箌 -箛 -箎 -箘 -箄 -箷 -箾 -篎 -箯 -箹 -篞 -篣 -篧 -篕 -篨 -篹 -簅 -篲 -篿 -篻 -簎 -篴 -簂 -簁 -篸 -篽 -簜 -簩 -簙 -簭 -簦 -簨 -簢 -簥 -簳 -簼 -簬 -簻 -籉 -籈 -籊 -籔 -籗 -籧 -籦 -籯 -籺 -籸 -籹 -粊 -粔 -粻 -糔 -糪 -糱 -糷 -紎 -紟 -紒 -紽 -紸 -紶 -紩 -絇 -紾 -絘 -絯 -絓 -絧 -絏 -絭 -絫 -綀 -綍 -絿 -綅 -絻 -絼 -綔 -綷 -緂 -綪 -緀 -緅 -緎 -緆 -緌 -綯 -綼 -緷 -緛 -緪 -緧 -縃 
-緺 -緶 -緰 -縗 -縌 -縓 -縎 -縜 -縚 -縏 -縼 -繂 -縳 -顈 -繈 -縸 -縪 -繉 -繀 -縩 -緵 -縰 -縿 -縶 -繜 -繐 -繣 -繘 -繢 -繟 -繑 -繠 -繶 -繵 -繸 -繷 -繺 -繲 -繴 -纀 -纇 -纋 -纆 -纑 -纗 -纚 -缿 -罊 -罏 -罜 -罞 -罝 -罛 -罣 -罥 -罦 -罭 -罫 -罬 -罻 -罼 -罺 -罿 -羃 -羉 -羍 -羒 -羜 -羛 -羢 -羠 -羦 -羬 -羭 -羵 -羳 -羷 -羺 -羾 -翋 -翍 -翐 -翑 -翇 -翢 -翣 -翭 -翪 -翨 -翴 -翲 -翽 -翿 -耟 -耞 -耡 -耴 -耾 -耹 -聇 -聈 -聑 -聏 -聝 -肕 -肙 -肒 -肣 -肵 -胘 -胑 -胐 -胕 -胉 -胏 -胹 -胵 -脁 -胻 -脀 -胾 -胔 -脰 -脥 -脤 -脙 -脡 -脕 -脧 -腃 -腏 -腄 -腇 -脽 -腍 -腤 -腷 -腜 -腛 -腢 -腲 -朡 -腞 -腶 -膉 -膆 -膃 -膇 -膍 -膌 -膋 -膟 -膕 -膢 -膱 -膹 -膫 -膰 -膬 -膴 -膲 -臇 -膷 -臄 -臅 -臒 -臐 -臗 -臛 -臡 -臦 -臩 -臮 -臲 -臷 -臸 -臿 -舋 -舑 -舕 -舝 -舡 -舼 -舽 -艀 -艂 -艓 -艒 -艐 -艑 -艕 -艛 -艵 -艼 -芀 -芐 -芅 -芓 -芔 -苀 -芚 -芵 -芧 -芞 -芺 -苙 -苨 -苖 -苬 -苲 -苵 -苶 -茙 -茥 -茿 -茦 -茢 -荂 -茪 -荍 -茖 -茤 -茠 -茩 -茻 -莐 -莣 -莍 -荺 -莤 -荴 -莏 -莁 -荵 -莔 -莃 -莌 -莋 -荾 -莥 -菨 -萒 -菧 -菤 -菆 -菣 -菿 -菋 -菎 -菵 -萉 -菞 -菳 -菕 -蓱 -萿 -葹 -葥 -葀 -葧 -萰 -葍 -葽 -蔇 -葞 -萷 -萺 -萴 -葅 -菙 -葋 -萯 -葂 -葟 -葌 -蓎 -蒬 -蒮 -蒫 -蒪 -蒚 -蒝 -蓌 -蒛 -蒩 -蒘 -蒶 -蒠 -蔤 -蔏 -蔩 -蔉 -蔍 -蔧 -蔜 -蓻 -蓺 -蓴 -蔪 -蓲 -蓷 -蓫 -蔒 -蓩 -蔖 -蓾 -蔨 -蔮 -蔂 -蓶 -蔱 -蓹 -蔠 -蔰 -蕫 -蕍 -蕀 -蕆 -蕄 -蕇 -蕣 -蕛 -蕱 -蕵 -蕮 -蕧 -蕠 -蕦 -蕝 -薃 -薧 -薕 -薠 -薋 -薣 -薚 -蕼 -薉 -蕸 -薎 -薖 -薍 -薝 -薂 -藆 -藀 -藃 -藂 -薵 -薽 -藇 -藄 -藋 -藈 -藅 -薱 -薶 -藒 -藫 -藱 -藙 -藡 -藚 -藗 -藲 -藬 -藘 -藣 -藑 -藰 -蘁 -藾 -蘛 -蘉 -蘌 -蘪 -蘦 -蘟 -蘣 -蘜 -蘙 -蘮 -蘡 -蘠 -蘥 -蘴 -蘳 -蘬 -虀 -蘹 -蘱 -蘻 -蘾 -虃 -虆 -虇 -虈 -虌 -虋 -虙 -虡 -虣 -虩 -虪 -虰 -虭 -虴 -蚑 -蚞 -蚇 -蚗 -蚚 -蚅 -蚥 -蚙 -蚿 -蚷 -蛂 -蛁 -蛅 -蛈 -蚹 -蚳 -蚸 -蛌 -蚻 -蛢 -蛦 -蛓 -蛣 -蛚 -蛪 -蛝 -蛫 -蛜 -蛬 -蛗 -蜄 -蛷 -蜌 -蛖 -蛵 -蜁 -蛶 -蜳 -蝫 -蜙 -蝃 -蜬 -蝁 -蝆 -蜠 -蜲 -蜪 -蜭 -蜼 -蜵 -蝂 -蜦 -蜧 -蜸 -蜤 -蜰 -蝖 -蝷 -蟡 -蝳 -蝔 -蝛 -蝒 -蝑 -蝞 -蝭 -蝪 -蝐 -蝝 -蝬 -蝺 -蝜 -螛 -螏 -螓 -螒 -螁 -螖 -螘 -蝹 -螇 -螑 -螝 -螜 -螚 -螪 -螰 -螹 -螼 -螮 -蟉 -蟃 -蟂 -螷 -螴 -螿 -螸 -蟞 -蟧 -蟦 -蟢 -蟟 -蟤 -蟔 -蟓 -蟭 -蟘 -螤 -蟗 -蟙 -蠁 -蟨 -蠀 -蟺 -蠉 -蠌 -蟼 -蠈 -蟿 -蠗 -蠩 -蠝 -蠛 -蠠 -蠤 -蠜 -蠫 -蠬 -蠨 -蠦 -蠪 -蠥 -蠰 -蠮 -蠳 -蠸 -蠾 -蠽 -蠿 -衁 -衈 -衋 -衧 -衪 -衭 -衶 -袀 -衱 -衯 -袃 -袉 -袕 -袨 -袚 -袑 -袡 -袘 -袧 -袬 -袌 -袺 -裗 -袹 -袸 -裀 -袶 -袽 -袲 -裋 -裍 -裞 -裚 -裷 -裧 -裺 -裮 -裶 -裯 -裻 -褁 -褅 -褋 -褗 -褆 -褖 -褑 -褦 -褮 -褱 -褢 -褩 -褵 -褼 -褾 -襒 -褷 -襂 -褽 -襓 -襋 -襆 -襐 -襛 -襗 -襡 -襘 -襝 -襣 -襭 -襩 -襮 -襳 -襹 -襺 -覂 -覅 -覕 -覛 -覝 -覢 -覤 -覣 -覭 -覮 -覶 -觓 -觤 -觡 -觠 -觢 -觩 -觰 -觬 -觲 -觷 -觺 -觻 -觼 -觾 -訑 -訰 -訧 -訬 -訞 -詍 -訹 -詙 -詀 -詄 -詅 -訿 -誂 -詻 -誃 -誫 -誙 -誋 -諆 -誸 -諔 -諕 -誻 -諀 -諅 -諵 -諝 -諰 -諈 -謞 -謘 -謑 -謋 -謒 -謕 -謍 -謈 -謪 -謧 -謣 -謰 -謵 -譇 -謯 -謱 -謥 -謷 -謦 -譐 -譈 -譊 -譀 -譋 -譕 -譑 -譠 -譪 -譝 -譨 -譣 -譥 -譹 -譸 -譅 -譺 -譻 -譾 -讄 -讂 -讆 -讋 -讔 -讘 -讟 -谹 -谻 -谽 -谾 -豃 -豋 -豍 -豏 -豗 -豜 -豝 -豟 -豥 -豤 -豦 -豭 -豰 -豲 -豱 -豯 -豵 -豷 -豶 -豻 -豽 -貁 -貀 -貄 -貏 -貑 -貕 -貙 -貗 -貜 -貣 -貾 -賌 -賥 -賟 -賙 -賵 -賮 -贆 -贕 -贙 -赨 -赩 -赮 -赸 -趀 -趌 -趎 -趏 -趍 -趓 -趠 -趜 -趡 -趥 -趧 -趬 -趪 -趭 -趫 -趮 -趷 -趹 -跘 -跓 -跍 -跇 -跜 -跕 -跙 -跈 -跰 -跠 -跮 -跦 -跢 -跧 -跲 -跫 -踂 -跿 -踍 -踃 -踇 -踆 -跾 -踠 -踥 -踤 -踡 -踕 -踛 -踖 -踑 -踙 -踧 -踘 -踓 -踳 -踾 -踸 -踼 -蹎 -蹍 -蹓 -蹗 -蹖 -蹞 -蹥 -蹛 -蹡 -蹝 -蹔 -蹸 -蹳 -蹪 -躆 -躈 -躖 -躗 -躟 -躠 -躤 -躣 -躩 -躨 -躽 -軓 -軘 -軞 -軯 -軷 -軦 -軮 -軥 -軵 -軧 -軨 -軶 -軱 -軬 -輆 -軿 -輁 -輀 -輂 -輐 -輑 -輤 -輘 -輚 -輠 -輣 -輖 -輗 -輮 -輵 -輲 -輹 -輷 -輴 -轃 -轇 -轈 -轒 -轑 -轏 -轐 -轓 -轙 -轖 -轗 -轕 -轚 -轞 -轛 -轠 -辴 -迉 -迒 -迋 -迍 -迖 -迣 -迡 -迾 -迿 -逜 -逿 -遝 -遳 -遰 -遻 -邆 -邅 -遾 -邍 -邔 -邟 -邥 -邞 -邧 -郱 -郕 -郖 -郠 -郙 -郣 -郥 -郘 -郰 -郲 -郔 -鄬 -郼 -鄈 -郹 -郻 -鄁 -鄇 -郺 -鄐 -鄍 -鄏 -鄎 -鄟 -鄝 -鄡 -鄛 -鄨 -鄪 -鄦 -鄮 -鄵 -鄸 -鄻 -鄾 -酀 -酁 -酄 -酇 -酖 -酘 -酓 -酟 -酳 -醆 -醊 -醓 -醙 -醟 -醥 -醧 -醰 -醱 -醷 -醲 -醳 -醹 -醽 -釂 -釃 -釢 -釱 -釳 -釸 -鈚 -鈌 -鈒 -釽 -鈆 -鉒 -鉠 -鉯 -鈶 -鉼 -銤 -銛 -銔 -鉹 -銗 -鋄 -鋀 -鋟 -鋘 -鋩 -鋝 -鋂 -鋊 -錧 -錼 -錭 -錎 -鋋 -鎡 -鎃 -鎯 -鍖 -鍜 -鍐 -鍭 -鍌 -鎒 -鎷 -鎝 -鎉 -鎎 -鎞 -鏏 -鏂 -鏚 -鏬 -鏙 -鐋 -鐏 -鏾 -鐕 -鐨 -鐍 -鐀 -鐎 -鐖 -鐻 -鐶 -鑐 -鑋 -鑕 -鑮 -鑯 -钂 -钀 -钁 -钃 -镺 -镻 -镼 -镽 -閈 -閍 -閺 -閵 -闀 -闉 -闅 -閷 -闒 -闑 -闚 -闛 -闠 -闟 -闤 -阞 -阢 -阤 -阠 -阰 -阹 -阸 -阺 -陏 -陓 -陊 -陼 -陭 -陫 -隇 -陾 -隉 -隒 -隓 -隞 -隤 -隿 -雂 -雈 -雓 -雔 -雗 -雚 -雟 -雘 -雺 -雽 -雿 -霂 -霋 -霒 -霐 -霠 -霣 -霢 -霩 -霫 -霬 -霮 -霵 -霿 -靆 -靃 -靪 -靮 -靷 -靲 -靾 -鞃 -鞀 -鞂 -靻 -鞊 -鞎 -鞈 -鞙 -鞗 -鞚 -鞜 -鞤 -鞪 -鞷 -鞶 -鞹 -鞻 -鞿 -韄 -韅 -韇 -韎 -韐 -韏 -韕 -韔 -韗 -韝 -韟 -韣 -韥 -韰 -韱 -韹 -韽 -頄 -頖 -頞 -頝 -頩 -頨 -頯 -頲 -顁 -顄 -顊 -顉 -顅 -顐 -顑 -顜 -顝 -顠 -顣 -顟 -顤 
-顪 -顩 -顲 -颬 -颲 -颸 -颽 -颻 -颾 -飁 -飂 -飉 -飋 -飌 -飣 -飶 -餂 -餀 -飺 -餔 -餖 -餕 -餤 -餟 -餥 -餫 -餪 -餲 -餯 -餭 -餱 -餰 -饁 -饇 -饐 -饎 -饙 -饘 -饛 -饡 -馣 -馲 -馰 -馵 -馻 -馺 -駂 -馽 -駜 -駍 -駏 -駎 -駖 -駮 -駬 -駥 -駤 -駣 -駩 -駺 -駴 -駷 -駹 -駶 -駻 -駽 -駾 -騃 -騉 -騑 -騊 -騇 -騚 -騕 -騥 -騝 -騛 -騢 -騠 -騧 -騞 -騜 -騵 -騲 -騴 -騱 -騬 -騪 -騩 -騹 -騽 -驆 -騺 -驓 -驔 -驈 -驉 -驖 -驞 -驠 -驦 -驨 -骭 -骫 -骹 -骿 -骴 -骾 -髇 -髊 -髆 -髍 -髐 -髟 -髧 -髬 -髳 -髶 -髺 -髾 -鬁 -髼 -鬋 -鬊 -鬎 -鬌 -鬐 -鬕 -鬗 -鬖 -鬙 -鬞 -鬠 -鬤 -鬫 -鬳 -鬵 -鬺 -鬾 -鬿 -魊 -魌 -魖 -魠 -魡 -魧 -魱 -魦 -魶 -魵 -鮅 -鮇 -魼 -魾 -魻 -鮂 -鮚 -鮞 -鮛 -鮦 -鮥 -鮤 -鮆 -鯆 -鮿 -鮵 -鯈 -鯫 -鯠 -鯞 -鯦 -鯬 -鰌 -鰋 -鰅 -鯸 -鰫 -鰝 -鰬 -鱆 -鰿 -鱄 -鱁 -鰴 -鱐 -鱍 -鱋 -鱕 -鱦 -鱢 -鱞 -鱴 -鱳 -鱹 -鳦 -鳪 -鳭 -鳱 -鳵 -鳼 -鳺 -鳿 -鳷 -鴀 -鳹 -鳻 -鴅 -鴃 -鴥 -鴠 -鴔 -鴩 -鴘 -鴢 -鴐 -鴳 -鵁 -鵧 -鴶 -鴮 -鴱 -鴸 -鵅 -鵃 -鴾 -鵀 -鴽 -鵏 -鵊 -鵛 -鵋 -鵖 -鵌 -鵗 -鵔 -鵷 -鶁 -鶊 -鶄 -鶈 -鵱 -鶀 -鵸 -鶋 -鶌 -鵽 -鵫 -鵴 -鵩 -鶅 -鵳 -鵻 -鶂 -鵹 -鶟 -鶙 -鶤 -鶝 -鶐 -鶛 -鶠 -鶔 -鶜 -鶪 -鶗 -鶢 -鶨 -鶞 -鶣 -鶖 -鶷 -鶶 -鷁 -鷇 -鷊 -鷏 -鶾 -鷅 -鷃 -鶵 -鷈 -鶱 -鶭 -鷛 -鷒 -鷞 -鷋 -鷐 -鷜 -鷑 -鷩 -鷘 -鷖 -鷵 -鷕 -鷻 -鷷 -鷣 -鷤 -鷶 -鷡 -鷮 -鷢 -鸂 -鷾 -鸇 -鸃 -鸆 -鸅 -鸀 -鸁 -鸉 -鷿 -鷽 -鸄 -鸋 -鸍 -鸏 -鸒 -鸔 -鸓 -鸗 -鸙 -鹺 -麃 -麆 -麉 -麎 -麌 -麔 -麙 -麛 -麚 -麜 -麠 -麡 -麧 -麮 -麰 -麶 -麷 -黀 -黂 -黈 -黓 -黕 -黖 -黚 -黤 -黫 -黮 -黭 -黰 -黳 -黵 -黺 -鼁 -鼀 -鼆 -鼊 -鼏 -鼖 -鼛 -鼘 -鼜 -鼤 -鼣 -鼥 -鼪 -鼨 -鼭 -鼰 -鼮 -鼵 -鼳 -鼲 -鼸 -鼶 -齀 -齂 -齃 -齌 -齍 -齎 -齖 -齗 -齘 -齛 -齠 -齞 -齝 -齥 -齤 -齫 -齱 -齰 -齮 -齯 -齴 -齵 -齸 -齻 -齺 -齹 -齾 -龒 -龤 -堔 -礂 -蒏 -蒆 -兙 -兛 -兞 -兝 -兡 -兣 -嗧 -瓩 -忼 -擡 -氊 -穇 -擧 -譌 -! -" -# -$ -% -& -' -( -) -* -+ -, -- -. -/ -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -: -; -< -= -> -? -A -B -C -D -E -F -G -H -I -J -K -L -M -N -O -P -Q -R -S -T -U -V -W -X -Y -Z -[ -] -_ -` -a -b -c -d -e -f -g -h -i -j -k -l -m -n -o -p -q -r -s -t -u -v -w -x -y -z -© -° -² -´ -½ -Á -Ä -Å -Ç -È -É -Í -Ó -Ö -× -Ü -ß -à -á -â -ã -ä -å -æ -ç -è -é -ê -ë -í -ð -ñ -ò -ó -ô -õ -ö -ø -ú -û -ü -ý -ā -ă -ą -ć -Č -č -đ -ē -ė -ę -ğ -ī -ı -Ł -ł -ń -ň -ō -ř -Ş -ş -Š -š -ţ -ū -ż -Ž -ž -Ș -ș -ț -Δ -α -λ -μ -φ -Г -О -а -в -л -о -р -с -т -я -ồ -— -― -’ -“ -” -… -℃ -→ -∇ -− -■ -☆ -、 -。 -々 -〆 -〈 -〉 -「 -」 -『 -』 -〔 -〕 -〜 -! -# -% -& -( -) -+ -, -- -. -/ -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -: -; -= -? 
-@ -A -B -C -D -E -F -G -H -I -J -K -L -M -N -O -P -R -S -T -U -V -W -X -Z -a -b -c -d -e -f -g -h -i -j -k -l -m -n -o -p -q -r -s -t -u -v -w -x -y -z -~ -・ -ǎ -ǒ -ě -ǐ -ì -ǔ -ù -ǖ -ǘ -ǚ -ǜ -【 -】 -《 -》 -‥ -{ -} -\ -| -@ -^ -~ -÷ -∕ -∙ -⋅ -· -⊕ -⊖ -⊗ -⊘ -⊙ -± -∓ -∩ -∪ -□ -⊎ -⊓ -⊔ -≠ -≈ -≡ -≤ -≥ -≪ -≫ -≲ -≳ -≶ -≷ -≺ -≻ -≼ -≽ -∈ -∉ -⊂ -⊃ -⊆ -⊇ -⊄ -⊅ -∅ -∖ -∁ -∆ -∧ -∨ -¬ -⊻ -⊼ -⊽ -← -↔ -⇒ -⇐ -⇔ -∀ -∃ -∄ -∴ -∵ -∝ -∞ -⊥ -∟ -∠ -∡ -∢ -′ -″ -∥ -⊾ -⊿ -∂ -∫ -∬ -∭ -∮ -∯ -∰ -∑ -∏ -√ -∛ -∜ -∱ -∲ -∳ -∶ -∷ -∼ -® -≄ -≅ -≃ -≦ -≧ -⊈ -⊉ -⊢ -⊤ -⊨ -⊧ -℉ -Ω -℧ -Å -⌀ -ℏ -⅀ -⍺ -⍵ -¢ -€ -£ -¥ -¥ -₿ -↑ -↓ -↕ -↖ -↗ -↘ -↙ -↺ -↻ -↼ -↽ -↾ -↿ -⇀ -⇁ -⇂ -⇃ -⇋ -⇌ -ª -º -⁰ -¹ -³ -⁴ -⁵ -⁶ -⁷ -⁸ -⁹ -⁺ -⁻ -⁼ -⁽ -⁾ -ⁿ -₀ -₁ -₂ -₃ -₄ -₅ -₆ -₇ -₈ -₉ -₊ -₋ -₌ -₍ -₎ -Ⅰ -Ⅱ -Ⅲ -Ⅳ -Ⅴ -Ⅵ -Ⅶ -Ⅷ -Ⅸ -Ⅹ -Ⅺ -Ⅻ -ⅰ -ⅱ -ⅲ -ⅳ -ⅴ -ⅵ -ⅶ -ⅷ -ⅸ -ⅹ -ⅺ -ⅻ -☰ -☱ -☲ -☳ -☴ -☵ -☶ -☷ -♀ -♂ -♳ -♴ -♵ -♶ -♷ -♸ -♹ -♺ -♩ -♪ -♫ -♬ -⚪ -⚫ -⚬ -✶ -✷ -✸ -➀ -➁ -➂ -➃ -➄ -➅ -➆ -➇ -➈ -➉ -➊ -➋ -➌ -➍ -➎ -➏ -➐ -➑ -➒ -➓ -⏀ -⏁ -⏂ -⏃ -⏄ -⏅ -⏆ -⏇ -⏈ -⏉ -⏊ -⏋ -⏌ -⏚ -⏴ -⏵ -⏶ -⏷ -⏸ -⏹ -⏺ -⏻ -⏼ -Α -Β -Γ -Ε -Ζ -Η -Θ -Ι -Κ -Λ -Μ -Ν -Ξ -Ο -Π -Ρ -Σ -Τ -Υ -Φ -Χ -Ψ -β -γ -δ -ε -ζ -η -θ -ι -κ -ν -ξ -ο -π -ρ -σ -τ -υ -χ -ψ -ω -ϐ -ϑ -ϒ -ϕ -█ -ϖ -ϰ -ϱ -ϴ -ϵ -ϝ -Ϟ -ϟ -Ϡ -ϡ -Ϣ -ϣ -Ϥ -ϥ -Ϧ -ϧ -Ϩ -ϩ -Ϫ -ϫ -Ϭ -ϭ -Ϯ -ϯ -∸ -∹ -∺ -∻ -∽ -∾ -∿ -≀ -≁ -≂ -≆ -≇ -≉ -≊ -≋ -≌ -≍ -≎ -≏ -≐ -≑ -≒ -≓ -≔ -≕ -≖ -≗ -≘ -≙ -≚ -≛ -≜ -≝ -≞ -≟ -≢ -≣ -≨ -≩ -≬ -≭ -≮ -≯ -≰ -≱ -≴ -≵ -≸ -≹ -≾ -≿ -⊀ -⊁ -⊊ -⊋ -⊌ -⊍ -⊏ -⊐ -⊑ -⊒ -⊚ -⊛ -⊜ -⊝ -⊞ -⊟ -⊠ -⊡ -⊣ -⊦ -⊩ -⊪ -⊫ -⊬ -⊭ -⊮ -⊯ -⊰ -⊱ -⊲ -⊳ -⊴ -⊵ -⊶ -⊷ -⊸ -⊹ -⊺ -ℎ -℘ -ℜ -ℑ -ℵ -ℶ -ℷ -ℸ -⌬ -⌭ -⌮ -⌯ -⎔ -¤ -₠ -₡ -₢ -₣ -₤ -₥ -₦ -₧ -₨ -₩ -₪ -₫ -₭ -₮ -₯ -₰ -₱ -₲ -₳ -₴ -₵ -₶ -₷ -₸ -₹ -₺ -₻ -₼ -₽ -₾ -↚ -↛ -↜ -↝ -↞ -↟ -↠ -↡ -↢ -↣ -↤ -↥ -↦ -↧ -↨ -↩ -↪ -↫ -↬ -↭ -↮ -↯ -↰ -↱ -↲ -↳ -↴ -↵ -↶ -↷ -↸ -↹ -⇄ -⇅ -⇆ -⇇ -⇈ -⇉ -⇊ -⇍ -⇎ -⇏ -⇑ -⇓ -⇕ -⇖ -⇗ -⇘ -⇙ -⇚ -⇛ -⇜ -⇝ -⇞ -⇟ -⇠ -⇡ -⇢ -⇣ -⇤ -⇥ -⇦ -⇧ -⇨ -⇩ -⇪ -⇫ -⇬ -⇭ -⇮ -⇯ -⇰ -⇱ -⇲ -⇳ -⇴ -⇵ -⇶ -⇷ -⇸ -⇹ -⇺ -⇻ -⇼ -⇽ -⇾ -⇿ -ↀ -ↁ -ↂ -☀ -☁ -☂ -☃ -☄ -★ -☇ -☈ -☉ -☊ -☋ -☌ -☍ -☎ -☏ -☐ -☑ -☒ -☓ -☔ -☕ -☖ -☗ -☘ -☙ -☚ -☛ -☜ -☝ -☞ -☟ -☠ -☡ -☢ -☣ -☤ -☥ -☦ -☧ -☨ -☩ -☪ -☫ -☬ -☭ -☮ -☯ -☸ -☹ -☺ -☻ -☼ -☽ -☾ -☿ -♁ -♃ -♄ -♅ -♆ -♇ -♔ -♕ -♖ -♗ -♘ -♙ -♚ -♛ -♜ -♝ -♞ -♟ -♠ -♡ -♢ -♣ -♤ -♥ -♦ -♧ -♨ -♭ -♮ -♯ -♰ -♱ -♲ -♻ -♼ -♽ -♾ -⚀ -⚁ -⚂ -⚃ -⚄ -⚅ -⚆ -⚇ -⚈ -⚉ -⚊ -⚋ -⚌ -⚍ -⚎ -⚏ -⚐ -⚑ -⚒ -⚓ -⚔ -⚕ -⚖ -⚗ -⚘ -⚙ -⚚ -⚛ -⚜ -⚝ -⚞ -⚟ -⚠ -⚡ -⚢ -⚣ -⚤ -⚥ -⚦ -⚧ -⚨ -⚩ -⚭ -⚮ -⚯ -⚰ -⚱ -⚲ -⚳ -⚴ -⚵ -⚶ -⚷ -⚸ -⚹ -⚺ -⚻ -⚼ -⚿ -⛀ -⛁ -⛂ -⛃ -⛆ -⛇ -⛈ -⛉ -⛊ -⛋ -⛌ -⛍ -⛏ -⛐ -⛑ -⛒ -⛓ -⛕ -⛖ -⛗ -⛘ -⛙ -⛚ -⛛ -⛜ -⛝ -⛞ -⛠ -⛡ -⛢ -⛣ -⛤ -⛥ -⛦ -⛧ -⛨ -⛩ -⛪ -⛫ -⛬ -⛭ -⛮ -⛯ -⛶ -⛾ -⛿ -✆ -✇ -✈ -✉ -✌ -✍ -✎ -✏ -✐ -✑ -✒ -✓ -✔ -✕ -✙ -✚ -✛ -✜ -✝ -✞ -✟ -✠ -✡ -✢ -✣ -✤ -✥ -✦ -✧ -✩ -✪ -✫ -✬ -✭ -✮ -✯ -✰ -✱ -✲ -✳ -✴ -✵ -✹ -✺ -✻ -✼ -✽ -✾ -✿ -❀ -❁ -❂ -❃ -❄ -❅ -❆ -❇ -❈ -❉ -❊ -❋ -❍ -❏ -❐ -❑ -❒ -❖ -❘ -❙ -❚ -❛ -❜ -❝ -❞ -❡ -❢ -❣ -❤ -❥ -❦ -❧ -❨ -❩ -❪ -❫ -❬ -❭ -❮ -❯ -❰ -❱ -❲ -❳ -❴ -❵ -❶ -❷ -❸ -❹ -❺ -❻ -❼ -❽ -❾ -❿ -① -② -③ -④ -⑤ -⑥ -⑦ -⑧ -⑨ -⑩ -➔ -➕ -➖ -➗ -➘ -➙ -➚ -➛ -➜ -➝ -➞ -➟ -➠ -➡ -➢ -➣ -➤ -➥ -➦ -➧ -➨ -➩ -➪ -➫ -➬ -➭ -➮ -➯ -➰ -➱ -➲ -➳ -➴ -➵ -➶ -➷ -➸ -➹ -➺ -➻ -➼ -➽ -➾ -➿ -⌘ -⌥ -⌃ -⎋ -⌫ -⌦ -⏏ -⌤ -⌧ -⌨ -⎆ -⎇ -⎈ -⎉ -⎊ -⎌ -⎍ -⎎ -⎏ -⎐ -⎑ -⎒ -⎓ -⎕ -⎖ -⎗ -⎘ -⎙ -⎚ -⎛ -⎜ -⎝ -⎞ -⎟ -⎠ -⎡ -⎢ -⎣ -⎤ -⎥ -⎦ -⎧ -⎨ -⎩ -⎪ -⎫ -⎬ -⎭ -⎮ -⎯ -⎰ -⎱ -⎲ -⎳ -⎴ -⎵ -⎶ -⎷ -⎸ -⎹ -⎺ -⎻ -⎼ -⎽ -⎾ -⎿ -⏍ -⏎ -⏐ -⏑ -⏒ -⏓ -⏔ -⏕ -⏖ -⏗ -⏘ -⏙ -⏛ -⏜ -⏝ -⏞ -⏟ -⏠ -⏡ -⏢ -⏣ -⏤ -⏥ -⏦ -⏧ -⏨ -⏭ -⏮ -⏯ -⏱ -⏲ -▲ -▽ -◐ -⏽ -⏾ -⏿ -ɐ -ɑ -ɒ -ɓ -ɔ -ɕ -ɖ -ɗ -ɘ -ə -ɚ -ɛ -ɜ -ɝ -ɞ -ɟ -ɠ -ɡ -ɢ -ɣ -ɤ -ɥ -ɦ -ɧ -ɨ -ɩ -ɪ -ɫ -ɬ -ɭ -ɮ -ɯ -ɰ -ɱ -ɲ -ɳ -ɴ -ɵ -ɶ -ɷ -ɸ -ɹ -ɺ -ɻ -ɼ -ɽ -ɾ -ɿ -ʀ -ʁ -ʂ -ʃ -ʄ -ʅ -ʆ -ʇ -ʈ -ʉ -ʊ -ʋ -ʌ -ʍ -ʎ -ʏ -ʐ -ʑ -ʒ -ʓ -ʔ -ʕ -ʖ -ʗ -ʘ -ʙ -ʚ -ʛ -ʜ -ʝ -ʞ -ʟ -ʠ -ʡ -ʢ -ʣ -ʤ -ʥ -ʦ -ʧ -ʨ -ʩ -ʪ -ʫ 
-ʬ -ʭ -ʮ -ʯ -━ -Ǝ -à -● -▶ -| -𝑢 -〖 -〗 -︽ -– -﹥ -𝜓 -• -∋ -ƒ -० -✘ -Е -◉ -〒 -𝒱 -𝜆 -⟹ -﹪ -◊ -╆ -오 -˂ -〉 -𝝎 -▪ -△ -▁ -◼ -〇 -▷ -▬ -𝒮 -† -ₒ -⼁ -〵 -⭐ -╳ -⟶ -으 -⬆ -Ạ -◀ - -▫ -丄 -︾ -◥ -‖ -𝜌 -ⅼ -▼ -⁎ -﹏ -😁 -😂 -😃 -😄 -😅 -😆 -😉 -😊 -😋 -😌 -😍 -😏 -😒 -😓 -😔 -😖 -😘 -😚 -😜 -😝 -😞 -😠 -😡 -😢 -😣 -😤 -😥 -😨 -😩 -😪 -😫 -😭 -😰 -😱 -😲 -😳 -😵 -😷 -😸 -😹 -😺 -😻 -😼 -😽 -😾 -😿 -🙀 -🙅 -🙆 -🙇 -🙈 -🙉 -🙊 -🙋 -🙌 -🙍 -🙎 -🙏 -✂ -✅ -✊ -✋ -✖ -✨ -❌ -❎ -❓ -❔ -❕ -❗ -🚀 -🚃 -🚄 -🚅 -🚇 -🚉 -🚌 -🚏 -🚑 -🚒 -🚓 -🚕 -🚗 -🚙 -🚚 -🚢 -🚤 -🚥 -🚧 -🚨 -🚩 -🚪 -🚫 -🚬 -🚭 -🚲 -🚶 -🚹 -🚺 -🚻 -🚼 -🚽 -🚾 -🛀 -Ⓜ -🅰 -🅱 -🅾 -🅿 -🆎 -🆑 -🆒 -🆓 -🆔 -🆕 -🆖 -🆗 -🆘 -🆙 -🆚 -🇩🇪 -🇬🇧 -🇨🇳 -🇯🇵 -🇫🇷 -🇰🇷 -🇪🇸 -🇮🇹 -🇷🇺 -🇺🇸 -🈁 -ℹ -⌚ -⌛ -⏩ -⏪ -⏫ -⏬ -⏰ -⏳ -◻ -◽ -◾ -♈ -♉ -♊ -♋ -♌ -♍ -♎ -♏ -♐ -♑ -♒ -♓ -♿ -⚽ -⚾ -⛄ -⛅ -⛎ -⛔ -⛲ -⛳ -⛵ -⛺ -⛽ -⤴ -⤵ -⬅ -⬇ -⬛ -⬜ -⭕ -〰 -〽 -㊗ -㊙ -🀄 -🃏 -🌀 -🌁 -🌂 -🌃 -🌄 -🌅 -🌆 -🌇 -🌈 -🌉 -🌊 -🌋 -🌌 -🌏 -🌑 -🌓 -🌔 -🌕 -🌙 -🌛 -🌟 -🌠 -🌰 -🌱 -🌴 -🌵 -🌷 -🌸 -🌹 -🌺 -🌻 -🌼 -🌽 -🌾 -🌿 -🍀 -🍁 -🍂 -🍃 -🍄 -🍅 -🍆 -🍇 -🍈 -🍉 -🍊 -🍌 -🍍 -🍎 -🍏 -🍑 -🍒 -🍓 -🍔 -🍕 -🍖 -🍗 -🍘 -🍙 -🍚 -🍛 -🍜 -🍝 -🍞 -🍟 -🍠 -🍡 -🍢 -🍣 -🍤 -🍥 -🍦 -🍧 -🍨 -🍩 -🍪 -🍫 -🍬 -🍭 -🍮 -🍯 -🍰 -🍱 -🍲 -🍳 -🍴 -🍵 -🍶 -🍷 -🍸 -🍹 -🍺 -🍻 -🎀 -🎁 -🎂 -🎃 -🎄 -🎅 -🎆 -🎇 -🎈 -🎉 -🎊 -🎋 -🎌 -🎍 -🎎 -🎏 -🎐 -🎑 -🎒 -🎓 -🎠 -🎡 -🎢 -🎣 -🎤 -🎥 -🎦 -🎧 -🎨 -🎩 -🎪 -🎫 -🎬 -🎭 -🎮 -🎯 -🎰 -🎱 -🎲 -🎳 -🎴 -🎵 -🎶 -🎷 -🎸 -🎹 -🎺 -🎻 -🎼 -🎽 -🎾 -🎿 -🏀 -🏁 -🏂 -🏃 -🏄 -🏆 -🏈 -🏊 -🏠 -🏡 -🏢 -🏣 -🏥 -🏦 -🏧 -🏨 -🏩 -🏪 -🏫 -🏬 -🏭 -🏮 -🏯 -🏰 -🐌 -🐍 -🐎 -🐑 -🐒 -🐔 -🐗 -🐘 -🐙 -🐚 -🐛 -🐜 -🐝 -🐞 -🐟 -🐠 -🐡 -🐢 -🐣 -🐤 -🐥 -🐦 -🐧 -🐨 -🐩 -🐫 -🐬 -🐭 -🐮 -🐯 -🐰 -🐱 -🐲 -🐳 -🐴 -🐵 -🐶 -🐷 -🐸 -🐹 -🐺 -🐻 -🐼 -🐽 -🐾 -👀 -👂 -👃 -👄 -👅 -👆 -👇 -👈 -👉 -👊 -👋 -👌 -👍 -👎 -👏 -👐 -👑 -👒 -👓 -👔 -👕 -👖 -👗 -👘 -👙 -👚 -👛 -👜 -👝 -👞 -👟 -👠 -👡 -👢 -👣 -👤 -👦 -👧 -👨 -👩 -👪 -👫 -👮 -👯 -👰 -👱 -👲 -👳 -👴 -👵 -👶 -👷 -👸 -👹 -👺 -👻 -👼 -👽 -👾 -👿 -💀 -💁 -💂 -💃 -💄 -💅 -💆 -💇 -💈 -💉 -💊 -💋 -💌 -💍 -💎 -💏 -💐 -💑 -💒 -💓 -💔 -💕 -💖 -💗 -💘 -💙 -💚 -💛 -💜 -💝 -💞 -💟 -💠 -💡 -💢 -💣 -💤 -💥 -💦 -💧 -💨 -💩 -💪 -💫 -💬 -💮 -💯 -💰 -💲 -💳 -💴 -💵 -💸 -💹 -💺 -💻 -💼 -💽 -💾 -💿 -📀 -📁 -📂 -📃 -📄 -📅 -📆 -📇 -📈 -📉 -📊 -📋 -📌 -📍 -📎 -📏 -📐 -📑 -📒 -📓 -📔 -📕 -📖 -📗 -📘 -📙 -📚 -📛 -📜 -📝 -📞 -📟 -📠 -📡 -📢 -📣 -📤 -📥 -📦 -📧 -📨 -📩 -📪 -📫 -📮 -📰 -📱 -📲 -📳 -📴 -📶 -📷 -📹 -📺 -📻 -📼 -🔃 -🔊 -🔋 -🔌 -🔍 -🔎 -🔏 -🔐 -🔑 -🔒 -🔓 -🔔 -🔖 -🔗 -🔘 -🔙 -🔚 -🔛 -🔜 -🔝 -🔞 -🔟 -🔠 -🔡 -🔢 -🔣 -🔤 -🔥 -🔦 -🔧 -🔨 -🔩 -🔪 -🔫 -🔮 -🔯 -🔰 -🔱 -🔲 -🔳 -🔴 -🔵 -🔶 -🔷 -🔸 -🔹 -🔺 -🔻 -🔼 -🔽 -🕐 -🕑 -🕒 -🕓 -🕔 -🕕 -🕖 -🕗 -🕘 -🕙 -🕚 -🕛 -🗻 -🗼 -🗽 -🗾 -🗿 -😀 -😇 -😈 -😎 -😐 -😑 -😕 -😗 -😙 -😛 -😟 -😦 -😧 -😬 -😮 -😯 -😴 -😶 -🚁 -🚂 -🚆 -🚈 -🚊 -🚍 -🚎 -🚐 -🚔 -🚖 -🚘 -🚛 -🚜 -🚝 -🚞 -🚟 -🚠 -🚡 -🚣 -🚦 -🚮 -🚯 -🚰 -🚱 -🚳 -🚴 -🚵 -🚷 -🚸 -🚿 -🛁 -🛂 -🛃 -🛄 -🛅 -🌍 -🌎 -🌐 -🌒 -🌖 -🌗 -🌘 -🌚 -🌜 -🌝 -🌞 -🌲 -🌳 -🍋 -🍐 -🍼 -🏇 -🏉 -🏤 -🐀 -🐁 -🐂 -🐃 -🐄 -🐅 -🐆 -🐇 -🐈 -🐉 -🐊 -🐋 -🐏 -🐐 -🐓 -🐕 -🐖 -🐪 -👥 -👬 -👭 -💭 -💶 -💷 -📬 -📭 -📯 -📵 -🔀 -🔁 -🔂 -🔄 -🔅 -🔆 -🔇 -🔉 -🔕 -🔬 -🔭 -🕜 -🕝 -🕞 -🕟 -🕠 -🕡 -🕢 -🕣 -🕤 -🕥 -🕦 -🕧 diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ta_dict.txt b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ta_dict.txt deleted file mode 100644 index 19d81892c205627f296adbf8b20ea41aba2de5d0..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ta_dict.txt +++ /dev/null @@ -1,128 +0,0 @@ -t -a -_ -i -m -g -/ -3 -I -L -S -V -R -C -2 -0 -1 -v -l -9 -7 -8 -. -j -p -ப -ூ -த -ம -ி -வ -ர -் -ந -ோ -ன -6 -ஆ -ற -ல -5 -ள -ா -ொ -ழ -ு -4 -ெ -ண -க -ட -ை -ே -ச -ய -ஒ -இ -அ -ங -உ -ீ -ஞ -எ -ஓ -ஃ -ஜ -ஷ -ஸ -ஏ -ஊ -ஹ -ஈ -ஐ -ௌ -ஔ -s -c -e -n -w -F -T -O -P -K -A -N -G -Y -E -M -H -U -B -o -b -D -d -r -W -u -y -f -X -k -q -h -J -z -Z -Q -x -- -' -$ -, -% -@ -é -! -# -+ -É -& -: -( -? 
- diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/te_dict.txt b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/te_dict.txt deleted file mode 100644 index 83d74cc7e5f899ca43b23fa690d84d70bee535e3..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/te_dict.txt +++ /dev/null @@ -1,151 +0,0 @@ -t -e -_ -i -m -g -/ -5 -I -L -S -V -R -C -2 -0 -1 -v -a -l -3 -4 -8 -9 -. -j -p -త -ె -ర -క -్ -ి -ం -చ -ే -ద -ు -7 -6 -ఉ -ా -మ -ట -ో -వ -ప -ల -శ -ఆ -య -ై -భ -' -ీ -గ -ూ -డ -ధ -హ -న -జ -స -[ -‌ -ష -అ -ణ -ఫ -బ -ఎ -; -ళ -థ -ొ -ఠ -ృ -ఒ -ఇ -ః -ఊ -ఖ -- -ఐ -ఘ -ౌ -ఏ -ఈ -ఛ -, -ఓ -ఞ -| -? -: -ఢ -" -( -” -! -+ -) -* -= -& -“ -€ -] -£ -$ -s -c -n -w -k -J -G -u -d -r -E -o -h -y -b -f -B -M -O -T -N -D -P -A -F -x -W -Y -U -H -K -X -z -Z -Q -q -É -% -# -@ -é diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml deleted file mode 100644 index 876f3ee993f73e7d0e9af57336242c7403415f92..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml +++ /dev/null @@ -1,65 +0,0 @@ -lang: - ch_lite: - det: ch_PP-OCRv3_det_infer.pth - rec: ch_PP-OCRv5_rec_infer.pth - dict: ppocrv5_dict.txt - ch_lite_v4: - det: ch_PP-OCRv3_det_infer.pth - rec: ch_PP-OCRv4_rec_infer.pth - dict: ppocr_keys_v1.txt - ch_server: - det: ch_PP-OCRv3_det_infer.pth - rec: ch_PP-OCRv5_rec_server_infer.pth - dict: ppocrv5_dict.txt - ch_server_v4: - det: ch_PP-OCRv3_det_infer.pth - rec: ch_PP-OCRv4_rec_server_infer.pth - dict: ppocr_keys_v1.txt - ch: - det: ch_PP-OCRv3_det_infer.pth - rec: ch_PP-OCRv4_rec_server_doc_infer.pth - dict: ppocrv4_doc_dict.txt - en: - det: en_PP-OCRv3_det_infer.pth - rec: en_PP-OCRv4_rec_infer.pth - dict: en_dict.txt - korean: - det: Multilingual_PP-OCRv3_det_infer.pth - rec: korean_PP-OCRv3_rec_infer.pth - dict: korean_dict.txt - japan: - det: Multilingual_PP-OCRv3_det_infer.pth - rec: japan_PP-OCRv3_rec_infer.pth - dict: japan_dict.txt - chinese_cht: - det: Multilingual_PP-OCRv3_det_infer.pth - rec: chinese_cht_PP-OCRv3_rec_infer.pth - dict: chinese_cht_dict.txt - ta: - det: Multilingual_PP-OCRv3_det_infer.pth - rec: ta_PP-OCRv3_rec_infer.pth - dict: ta_dict.txt - te: - det: Multilingual_PP-OCRv3_det_infer.pth - rec: te_PP-OCRv3_rec_infer.pth - dict: te_dict.txt - ka: - det: Multilingual_PP-OCRv3_det_infer.pth - rec: ka_PP-OCRv3_rec_infer.pth - dict: ka_dict.txt - latin: - det: en_PP-OCRv3_det_infer.pth - rec: latin_PP-OCRv3_rec_infer.pth - dict: latin_dict.txt - arabic: - det: Multilingual_PP-OCRv3_det_infer.pth - rec: arabic_PP-OCRv3_rec_infer.pth - dict: arabic_dict.txt - cyrillic: - det: Multilingual_PP-OCRv3_det_infer.pth - rec: cyrillic_PP-OCRv3_rec_infer.pth - dict: cyrillic_dict.txt - devanagari: - det: Multilingual_PP-OCRv3_det_infer.pth - rec: devanagari_PP-OCRv3_rec_infer.pth - dict: devanagari_dict.txt \ No newline at end of file diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/__init__.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/__init__.py deleted file mode 100644 index f64ba567a631a847c6c2ea3d345f86865056cb53..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) Opendatalab. 
All rights reserved. \ No newline at end of file diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_cls.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_cls.py deleted file mode 100755 index 5dea3390a6d8bbeb41d8b765eeab38d3fae4ef65..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_cls.py +++ /dev/null @@ -1,106 +0,0 @@ -import cv2 -import copy -import numpy as np -import math -import time -import torch -from ...pytorchocr.base_ocr_v20 import BaseOCRV20 -from . import pytorchocr_utility as utility -from ...pytorchocr.postprocess import build_post_process - - -class TextClassifier(BaseOCRV20): - def __init__(self, args, **kwargs): - self.device = args.device - self.cls_image_shape = [int(v) for v in args.cls_image_shape.split(",")] - self.cls_batch_num = args.cls_batch_num - self.cls_thresh = args.cls_thresh - postprocess_params = { - 'name': 'ClsPostProcess', - "label_list": args.label_list, - } - self.postprocess_op = build_post_process(postprocess_params) - - self.weights_path = args.cls_model_path - self.yaml_path = args.cls_yaml_path - network_config = utility.get_arch_config(self.weights_path) - super(TextClassifier, self).__init__(network_config, **kwargs) - - self.cls_image_shape = [int(v) for v in args.cls_image_shape.split(",")] - - self.limited_max_width = args.limited_max_width - self.limited_min_width = args.limited_min_width - - self.load_pytorch_weights(self.weights_path) - self.net.eval() - self.net.to(self.device) - - def resize_norm_img(self, img): - imgC, imgH, imgW = self.cls_image_shape - h = img.shape[0] - w = img.shape[1] - ratio = w / float(h) - imgW = max(min(imgW, self.limited_max_width), self.limited_min_width) - ratio_imgH = math.ceil(imgH * ratio) - ratio_imgH = max(ratio_imgH, self.limited_min_width) - if ratio_imgH > imgW: - resized_w = imgW - else: - resized_w = int(math.ceil(imgH * ratio)) - resized_image = cv2.resize(img, (resized_w, imgH)) - resized_image = resized_image.astype('float32') - if self.cls_image_shape[0] == 1: - resized_image = resized_image / 255 - resized_image = resized_image[np.newaxis, :] - else: - resized_image = resized_image.transpose((2, 0, 1)) / 255 - resized_image -= 0.5 - resized_image /= 0.5 - padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32) - padding_im[:, :, 0:resized_w] = resized_image - return padding_im - - def __call__(self, img_list): - img_list = copy.deepcopy(img_list) - img_num = len(img_list) - # Calculate the aspect ratio of all text bars - width_list = [] - for img in img_list: - width_list.append(img.shape[1] / float(img.shape[0])) - # Sorting can speed up the cls process - indices = np.argsort(np.array(width_list)) - - cls_res = [['', 0.0]] * img_num - batch_num = self.cls_batch_num - elapse = 0 - for beg_img_no in range(0, img_num, batch_num): - end_img_no = min(img_num, beg_img_no + batch_num) - norm_img_batch = [] - max_wh_ratio = 0 - for ino in range(beg_img_no, end_img_no): - h, w = img_list[indices[ino]].shape[0:2] - wh_ratio = w * 1.0 / h - max_wh_ratio = max(max_wh_ratio, wh_ratio) - for ino in range(beg_img_no, end_img_no): - norm_img = self.resize_norm_img(img_list[indices[ino]]) - norm_img = norm_img[np.newaxis, :] - norm_img_batch.append(norm_img) - norm_img_batch = np.concatenate(norm_img_batch) - norm_img_batch = norm_img_batch.copy() - starttime = time.time() - - with torch.no_grad(): - inp = torch.from_numpy(norm_img_batch) - inp = inp.to(self.device) - prob_out = 
self.net(inp) - prob_out = prob_out.cpu().numpy() - - cls_result = self.postprocess_op(prob_out) - elapse += time.time() - starttime - for rno in range(len(cls_result)): - label, score = cls_result[rno] - cls_res[indices[beg_img_no + rno]] = [label, score] - if '180' in label and score > self.cls_thresh: - img_list[indices[beg_img_no + rno]] = cv2.rotate( - img_list[indices[beg_img_no + rno]], 1) - return img_list, cls_res, elapse diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_det.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_det.py deleted file mode 100755 index c6f1f9c7b95e61b8ee3789246238f999b3378bd5..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_det.py +++ /dev/null @@ -1,217 +0,0 @@ -import sys - -import numpy as np -import time -import torch -from ...pytorchocr.base_ocr_v20 import BaseOCRV20 -from . import pytorchocr_utility as utility -from ...pytorchocr.data import create_operators, transform -from ...pytorchocr.postprocess import build_post_process - - -class TextDetector(BaseOCRV20): - def __init__(self, args, **kwargs): - self.args = args - self.det_algorithm = args.det_algorithm - self.device = args.device - pre_process_list = [{ - 'DetResizeForTest': { - 'limit_side_len': args.det_limit_side_len, - 'limit_type': args.det_limit_type, - } - }, { - 'NormalizeImage': { - 'std': [0.229, 0.224, 0.225], - 'mean': [0.485, 0.456, 0.406], - 'scale': '1./255.', - 'order': 'hwc' - } - }, { - 'ToCHWImage': None - }, { - 'KeepKeys': { - 'keep_keys': ['image', 'shape'] - } - }] - postprocess_params = {} - if self.det_algorithm == "DB": - postprocess_params['name'] = 'DBPostProcess' - postprocess_params["thresh"] = args.det_db_thresh - postprocess_params["box_thresh"] = args.det_db_box_thresh - postprocess_params["max_candidates"] = 1000 - postprocess_params["unclip_ratio"] = args.det_db_unclip_ratio - postprocess_params["use_dilation"] = args.use_dilation - postprocess_params["score_mode"] = args.det_db_score_mode - elif self.det_algorithm == "DB++": - postprocess_params['name'] = 'DBPostProcess' - postprocess_params["thresh"] = args.det_db_thresh - postprocess_params["box_thresh"] = args.det_db_box_thresh - postprocess_params["max_candidates"] = 1000 - postprocess_params["unclip_ratio"] = args.det_db_unclip_ratio - postprocess_params["use_dilation"] = args.use_dilation - postprocess_params["score_mode"] = args.det_db_score_mode - pre_process_list[1] = { - 'NormalizeImage': { - 'std': [1.0, 1.0, 1.0], - 'mean': - [0.48109378172549, 0.45752457890196, 0.40787054090196], - 'scale': '1./255.', - 'order': 'hwc' - } - } - elif self.det_algorithm == "EAST": - postprocess_params['name'] = 'EASTPostProcess' - postprocess_params["score_thresh"] = args.det_east_score_thresh - postprocess_params["cover_thresh"] = args.det_east_cover_thresh - postprocess_params["nms_thresh"] = args.det_east_nms_thresh - elif self.det_algorithm == "SAST": - pre_process_list[0] = { - 'DetResizeForTest': { - 'resize_long': args.det_limit_side_len - } - } - postprocess_params['name'] = 'SASTPostProcess' - postprocess_params["score_thresh"] = args.det_sast_score_thresh - postprocess_params["nms_thresh"] = args.det_sast_nms_thresh - self.det_sast_polygon = args.det_sast_polygon - if self.det_sast_polygon: - postprocess_params["sample_pts_num"] = 6 - postprocess_params["expand_scale"] = 1.2 - postprocess_params["shrink_ratio_of_width"] = 0.2 - else: - postprocess_params["sample_pts_num"] = 2 - 
postprocess_params["expand_scale"] = 1.0 - postprocess_params["shrink_ratio_of_width"] = 0.3 - elif self.det_algorithm == "PSE": - postprocess_params['name'] = 'PSEPostProcess' - postprocess_params["thresh"] = args.det_pse_thresh - postprocess_params["box_thresh"] = args.det_pse_box_thresh - postprocess_params["min_area"] = args.det_pse_min_area - postprocess_params["box_type"] = args.det_pse_box_type - postprocess_params["scale"] = args.det_pse_scale - self.det_pse_box_type = args.det_pse_box_type - elif self.det_algorithm == "FCE": - pre_process_list[0] = { - 'DetResizeForTest': { - 'rescale_img': [1080, 736] - } - } - postprocess_params['name'] = 'FCEPostProcess' - postprocess_params["scales"] = args.scales - postprocess_params["alpha"] = args.alpha - postprocess_params["beta"] = args.beta - postprocess_params["fourier_degree"] = args.fourier_degree - postprocess_params["box_type"] = args.det_fce_box_type - else: - print("unknown det_algorithm:{}".format(self.det_algorithm)) - sys.exit(0) - - self.preprocess_op = create_operators(pre_process_list) - self.postprocess_op = build_post_process(postprocess_params) - - self.weights_path = args.det_model_path - self.yaml_path = args.det_yaml_path - network_config = utility.get_arch_config(self.weights_path) - super(TextDetector, self).__init__(network_config, **kwargs) - self.load_pytorch_weights(self.weights_path) - self.net.eval() - self.net.to(self.device) - - def order_points_clockwise(self, pts): - """ - reference from: https://github.com/jrosebr1/imutils/blob/master/imutils/perspective.py - # sort the points based on their x-coordinates - """ - xSorted = pts[np.argsort(pts[:, 0]), :] - - # grab the left-most and right-most points from the sorted - # x-roodinate points - leftMost = xSorted[:2, :] - rightMost = xSorted[2:, :] - - # now, sort the left-most coordinates according to their - # y-coordinates so we can grab the top-left and bottom-left - # points, respectively - leftMost = leftMost[np.argsort(leftMost[:, 1]), :] - (tl, bl) = leftMost - - rightMost = rightMost[np.argsort(rightMost[:, 1]), :] - (tr, br) = rightMost - - rect = np.array([tl, tr, br, bl], dtype="float32") - return rect - - def clip_det_res(self, points, img_height, img_width): - for pno in range(points.shape[0]): - points[pno, 0] = int(min(max(points[pno, 0], 0), img_width - 1)) - points[pno, 1] = int(min(max(points[pno, 1], 0), img_height - 1)) - return points - - def filter_tag_det_res(self, dt_boxes, image_shape): - img_height, img_width = image_shape[0:2] - dt_boxes_new = [] - for box in dt_boxes: - box = self.order_points_clockwise(box) - box = self.clip_det_res(box, img_height, img_width) - rect_width = int(np.linalg.norm(box[0] - box[1])) - rect_height = int(np.linalg.norm(box[0] - box[3])) - if rect_width <= 3 or rect_height <= 3: - continue - dt_boxes_new.append(box) - dt_boxes = np.array(dt_boxes_new) - return dt_boxes - - def filter_tag_det_res_only_clip(self, dt_boxes, image_shape): - img_height, img_width = image_shape[0:2] - dt_boxes_new = [] - for box in dt_boxes: - box = self.clip_det_res(box, img_height, img_width) - dt_boxes_new.append(box) - dt_boxes = np.array(dt_boxes_new) - return dt_boxes - - def __call__(self, img): - ori_im = img.copy() - data = {'image': img} - data = transform(data, self.preprocess_op) - img, shape_list = data - if img is None: - return None, 0 - img = np.expand_dims(img, axis=0) - shape_list = np.expand_dims(shape_list, axis=0) - img = img.copy() - starttime = time.time() - - with torch.no_grad(): - inp = 
torch.from_numpy(img) - inp = inp.to(self.device) - outputs = self.net(inp) - - preds = {} - if self.det_algorithm == "EAST": - preds['f_geo'] = outputs['f_geo'].cpu().numpy() - preds['f_score'] = outputs['f_score'].cpu().numpy() - elif self.det_algorithm == 'SAST': - preds['f_border'] = outputs['f_border'].cpu().numpy() - preds['f_score'] = outputs['f_score'].cpu().numpy() - preds['f_tco'] = outputs['f_tco'].cpu().numpy() - preds['f_tvo'] = outputs['f_tvo'].cpu().numpy() - elif self.det_algorithm in ['DB', 'PSE', 'DB++']: - preds['maps'] = outputs['maps'].cpu().numpy() - elif self.det_algorithm == 'FCE': - for i, (k, output) in enumerate(outputs.items()): - preds['level_{}'.format(i)] = output - else: - raise NotImplementedError - - post_result = self.postprocess_op(preds, shape_list) - dt_boxes = post_result[0]['points'] - if (self.det_algorithm == "SAST" and - self.det_sast_polygon) or (self.det_algorithm in ["PSE", "FCE"] and - self.postprocess_op.box_type == 'poly'): - dt_boxes = self.filter_tag_det_res_only_clip(dt_boxes, ori_im.shape) - else: - dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im.shape) - - elapse = time.time() - starttime - return dt_boxes, elapse diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_rec.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_rec.py deleted file mode 100755 index c06ca5fe3f5bd0c4e38502ff548e2b488eeac233..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_rec.py +++ /dev/null @@ -1,446 +0,0 @@ -from PIL import Image -import cv2 -import numpy as np -import math -import time -import torch -from tqdm import tqdm - -from ...pytorchocr.base_ocr_v20 import BaseOCRV20 -from . import pytorchocr_utility as utility -from ...pytorchocr.postprocess import build_post_process - - -class TextRecognizer(BaseOCRV20): - def __init__(self, args, **kwargs): - self.device = args.device - self.rec_image_shape = [int(v) for v in args.rec_image_shape.split(",")] - self.character_type = args.rec_char_type - self.rec_batch_num = args.rec_batch_num - self.rec_algorithm = args.rec_algorithm - self.max_text_length = args.max_text_length - postprocess_params = { - 'name': 'CTCLabelDecode', - "character_type": args.rec_char_type, - "character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char - } - if self.rec_algorithm == "SRN": - postprocess_params = { - 'name': 'SRNLabelDecode', - "character_type": args.rec_char_type, - "character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char - } - elif self.rec_algorithm == "RARE": - postprocess_params = { - 'name': 'AttnLabelDecode', - "character_type": args.rec_char_type, - "character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char - } - elif self.rec_algorithm == 'NRTR': - postprocess_params = { - 'name': 'NRTRLabelDecode', - "character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char - } - elif self.rec_algorithm == "SAR": - postprocess_params = { - 'name': 'SARLabelDecode', - "character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char - } - elif self.rec_algorithm == 'ViTSTR': - postprocess_params = { - 'name': 'ViTSTRLabelDecode', - "character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char - } - elif self.rec_algorithm == "CAN": - self.inverse = args.rec_image_inverse - postprocess_params = { - 'name': 'CANLabelDecode', - 
"character_dict_path": args.rec_char_dict_path, - "use_space_char": args.use_space_char - } - elif self.rec_algorithm == 'RFL': - postprocess_params = { - 'name': 'RFLLabelDecode', - "character_dict_path": None, - "use_space_char": args.use_space_char - } - self.postprocess_op = build_post_process(postprocess_params) - - self.limited_max_width = args.limited_max_width - self.limited_min_width = args.limited_min_width - - self.weights_path = args.rec_model_path - self.yaml_path = args.rec_yaml_path - - network_config = utility.get_arch_config(self.weights_path) - weights = self.read_pytorch_weights(self.weights_path) - - self.out_channels = self.get_out_channels(weights) - if self.rec_algorithm == 'NRTR': - self.out_channels = list(weights.values())[-1].numpy().shape[0] - elif self.rec_algorithm == 'SAR': - self.out_channels = list(weights.values())[-3].numpy().shape[0] - - kwargs['out_channels'] = self.out_channels - super(TextRecognizer, self).__init__(network_config, **kwargs) - - self.load_state_dict(weights) - self.net.eval() - self.net.to(self.device) - - def resize_norm_img(self, img, max_wh_ratio): - imgC, imgH, imgW = self.rec_image_shape - if self.rec_algorithm == 'NRTR' or self.rec_algorithm == 'ViTSTR': - img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - # return padding_im - image_pil = Image.fromarray(np.uint8(img)) - if self.rec_algorithm == 'ViTSTR': - img = image_pil.resize([imgW, imgH], Image.BICUBIC) - else: - img = image_pil.resize([imgW, imgH], Image.ANTIALIAS) - img = np.array(img) - norm_img = np.expand_dims(img, -1) - norm_img = norm_img.transpose((2, 0, 1)) - if self.rec_algorithm == 'ViTSTR': - norm_img = norm_img.astype(np.float32) / 255. - else: - norm_img = norm_img.astype(np.float32) / 128. - 1. - return norm_img - elif self.rec_algorithm == 'RFL': - img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - resized_image = cv2.resize( - img, (imgW, imgH), interpolation=cv2.INTER_CUBIC) - resized_image = resized_image.astype('float32') - resized_image = resized_image / 255 - resized_image = resized_image[np.newaxis, :] - resized_image -= 0.5 - resized_image /= 0.5 - return resized_image - - assert imgC == img.shape[2] - max_wh_ratio = max(max_wh_ratio, imgW / imgH) - imgW = int((imgH * max_wh_ratio)) - imgW = max(min(imgW, self.limited_max_width), self.limited_min_width) - h, w = img.shape[:2] - ratio = w / float(h) - ratio_imgH = math.ceil(imgH * ratio) - ratio_imgH = max(ratio_imgH, self.limited_min_width) - if ratio_imgH > imgW: - resized_w = imgW - else: - resized_w = int(ratio_imgH) - resized_image = cv2.resize(img, (resized_w, imgH)) - resized_image = resized_image.astype('float32') - resized_image = resized_image.transpose((2, 0, 1)) / 255 - resized_image -= 0.5 - resized_image /= 0.5 - padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32) - padding_im[:, :, 0:resized_w] = resized_image - return padding_im - - def resize_norm_img_svtr(self, img, image_shape): - - imgC, imgH, imgW = image_shape - resized_image = cv2.resize( - img, (imgW, imgH), interpolation=cv2.INTER_LINEAR) - resized_image = resized_image.astype('float32') - resized_image = resized_image.transpose((2, 0, 1)) / 255 - resized_image -= 0.5 - resized_image /= 0.5 - return resized_image - - - def resize_norm_img_srn(self, img, image_shape): - imgC, imgH, imgW = image_shape - - img_black = np.zeros((imgH, imgW)) - im_hei = img.shape[0] - im_wid = img.shape[1] - - if im_wid <= im_hei * 1: - img_new = cv2.resize(img, (imgH * 1, imgH)) - elif im_wid <= im_hei * 2: - img_new = cv2.resize(img, (imgH * 2, 
imgH)) - elif im_wid <= im_hei * 3: - img_new = cv2.resize(img, (imgH * 3, imgH)) - else: - img_new = cv2.resize(img, (imgW, imgH)) - - img_np = np.asarray(img_new) - img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY) - img_black[:, 0:img_np.shape[1]] = img_np - img_black = img_black[:, :, np.newaxis] - - row, col, c = img_black.shape - c = 1 - - return np.reshape(img_black, (c, row, col)).astype(np.float32) - - def srn_other_inputs(self, image_shape, num_heads, max_text_length): - - imgC, imgH, imgW = image_shape - feature_dim = int((imgH / 8) * (imgW / 8)) - - encoder_word_pos = np.array(range(0, feature_dim)).reshape( - (feature_dim, 1)).astype('int64') - gsrm_word_pos = np.array(range(0, max_text_length)).reshape( - (max_text_length, 1)).astype('int64') - - gsrm_attn_bias_data = np.ones((1, max_text_length, max_text_length)) - gsrm_slf_attn_bias1 = np.triu(gsrm_attn_bias_data, 1).reshape( - [-1, 1, max_text_length, max_text_length]) - gsrm_slf_attn_bias1 = np.tile( - gsrm_slf_attn_bias1, - [1, num_heads, 1, 1]).astype('float32') * [-1e9] - - gsrm_slf_attn_bias2 = np.tril(gsrm_attn_bias_data, -1).reshape( - [-1, 1, max_text_length, max_text_length]) - gsrm_slf_attn_bias2 = np.tile( - gsrm_slf_attn_bias2, - [1, num_heads, 1, 1]).astype('float32') * [-1e9] - - encoder_word_pos = encoder_word_pos[np.newaxis, :] - gsrm_word_pos = gsrm_word_pos[np.newaxis, :] - - return [ - encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, - gsrm_slf_attn_bias2 - ] - - def process_image_srn(self, img, image_shape, num_heads, max_text_length): - norm_img = self.resize_norm_img_srn(img, image_shape) - norm_img = norm_img[np.newaxis, :] - - [encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] = \ - self.srn_other_inputs(image_shape, num_heads, max_text_length) - - gsrm_slf_attn_bias1 = gsrm_slf_attn_bias1.astype(np.float32) - gsrm_slf_attn_bias2 = gsrm_slf_attn_bias2.astype(np.float32) - encoder_word_pos = encoder_word_pos.astype(np.int64) - gsrm_word_pos = gsrm_word_pos.astype(np.int64) - - return (norm_img, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, - gsrm_slf_attn_bias2) - - def resize_norm_img_sar(self, img, image_shape, - width_downsample_ratio=0.25): - imgC, imgH, imgW_min, imgW_max = image_shape - h = img.shape[0] - w = img.shape[1] - valid_ratio = 1.0 - # make sure new_width is an integral multiple of width_divisor. 
- width_divisor = int(1 / width_downsample_ratio) - # resize - ratio = w / float(h) - resize_w = math.ceil(imgH * ratio) - if resize_w % width_divisor != 0: - resize_w = round(resize_w / width_divisor) * width_divisor - if imgW_min is not None: - resize_w = max(imgW_min, resize_w) - if imgW_max is not None: - valid_ratio = min(1.0, 1.0 * resize_w / imgW_max) - resize_w = min(imgW_max, resize_w) - resized_image = cv2.resize(img, (resize_w, imgH)) - resized_image = resized_image.astype('float32') - # norm - if image_shape[0] == 1: - resized_image = resized_image / 255 - resized_image = resized_image[np.newaxis, :] - else: - resized_image = resized_image.transpose((2, 0, 1)) / 255 - resized_image -= 0.5 - resized_image /= 0.5 - resize_shape = resized_image.shape - padding_im = -1.0 * np.ones((imgC, imgH, imgW_max), dtype=np.float32) - padding_im[:, :, 0:resize_w] = resized_image - pad_shape = padding_im.shape - - return padding_im, resize_shape, pad_shape, valid_ratio - - - def norm_img_can(self, img, image_shape): - - img = cv2.cvtColor( - img, cv2.COLOR_BGR2GRAY) # CAN only predict gray scale image - - if self.inverse: - img = 255 - img - - if self.rec_image_shape[0] == 1: - h, w = img.shape - _, imgH, imgW = self.rec_image_shape - if h < imgH or w < imgW: - padding_h = max(imgH - h, 0) - padding_w = max(imgW - w, 0) - img_padded = np.pad(img, ((0, padding_h), (0, padding_w)), - 'constant', - constant_values=(255)) - img = img_padded - - img = np.expand_dims(img, 0) / 255.0 # h,w,c -> c,h,w - img = img.astype('float32') - - return img - - def __call__(self, img_list, tqdm_enable=False): - img_num = len(img_list) - # Calculate the aspect ratio of all text bars - width_list = [] - for img in img_list: - width_list.append(img.shape[1] / float(img.shape[0])) - # Sorting can speed up the recognition process - indices = np.argsort(np.array(width_list)) - - # rec_res = [] - rec_res = [['', 0.0]] * img_num - batch_num = self.rec_batch_num - elapse = 0 - # for beg_img_no in range(0, img_num, batch_num): - with tqdm(total=img_num, desc='OCR-rec Predict', disable=not tqdm_enable) as pbar: - index = 0 - for beg_img_no in range(0, img_num, batch_num): - end_img_no = min(img_num, beg_img_no + batch_num) - norm_img_batch = [] - max_wh_ratio = 0 - for ino in range(beg_img_no, end_img_no): - # h, w = img_list[ino].shape[0:2] - h, w = img_list[indices[ino]].shape[0:2] - wh_ratio = w * 1.0 / h - max_wh_ratio = max(max_wh_ratio, wh_ratio) - for ino in range(beg_img_no, end_img_no): - if self.rec_algorithm == "SAR": - norm_img, _, _, valid_ratio = self.resize_norm_img_sar( - img_list[indices[ino]], self.rec_image_shape) - norm_img = norm_img[np.newaxis, :] - valid_ratio = np.expand_dims(valid_ratio, axis=0) - valid_ratios = [] - valid_ratios.append(valid_ratio) - norm_img_batch.append(norm_img) - - elif self.rec_algorithm == "SVTR": - norm_img = self.resize_norm_img_svtr(img_list[indices[ino]], - self.rec_image_shape) - norm_img = norm_img[np.newaxis, :] - norm_img_batch.append(norm_img) - elif self.rec_algorithm == "SRN": - norm_img = self.process_image_srn(img_list[indices[ino]], - self.rec_image_shape, 8, - self.max_text_length) - encoder_word_pos_list = [] - gsrm_word_pos_list = [] - gsrm_slf_attn_bias1_list = [] - gsrm_slf_attn_bias2_list = [] - encoder_word_pos_list.append(norm_img[1]) - gsrm_word_pos_list.append(norm_img[2]) - gsrm_slf_attn_bias1_list.append(norm_img[3]) - gsrm_slf_attn_bias2_list.append(norm_img[4]) - norm_img_batch.append(norm_img[0]) - elif self.rec_algorithm == "CAN": - norm_img = 
self.norm_img_can(img_list[indices[ino]], - max_wh_ratio) - norm_img = norm_img[np.newaxis, :] - norm_img_batch.append(norm_img) - norm_image_mask = np.ones(norm_img.shape, dtype='float32') - word_label = np.ones([1, 36], dtype='int64') - norm_img_mask_batch = [] - word_label_list = [] - norm_img_mask_batch.append(norm_image_mask) - word_label_list.append(word_label) - else: - norm_img = self.resize_norm_img(img_list[indices[ino]], - max_wh_ratio) - norm_img = norm_img[np.newaxis, :] - norm_img_batch.append(norm_img) - norm_img_batch = np.concatenate(norm_img_batch) - norm_img_batch = norm_img_batch.copy() - - if self.rec_algorithm == "SRN": - starttime = time.time() - encoder_word_pos_list = np.concatenate(encoder_word_pos_list) - gsrm_word_pos_list = np.concatenate(gsrm_word_pos_list) - gsrm_slf_attn_bias1_list = np.concatenate( - gsrm_slf_attn_bias1_list) - gsrm_slf_attn_bias2_list = np.concatenate( - gsrm_slf_attn_bias2_list) - - with torch.no_grad(): - inp = torch.from_numpy(norm_img_batch) - encoder_word_pos_inp = torch.from_numpy(encoder_word_pos_list) - gsrm_word_pos_inp = torch.from_numpy(gsrm_word_pos_list) - gsrm_slf_attn_bias1_inp = torch.from_numpy(gsrm_slf_attn_bias1_list) - gsrm_slf_attn_bias2_inp = torch.from_numpy(gsrm_slf_attn_bias2_list) - - inp = inp.to(self.device) - encoder_word_pos_inp = encoder_word_pos_inp.to(self.device) - gsrm_word_pos_inp = gsrm_word_pos_inp.to(self.device) - gsrm_slf_attn_bias1_inp = gsrm_slf_attn_bias1_inp.to(self.device) - gsrm_slf_attn_bias2_inp = gsrm_slf_attn_bias2_inp.to(self.device) - - backbone_out = self.net.backbone(inp) # backbone_feat - prob_out = self.net.head(backbone_out, [encoder_word_pos_inp, gsrm_word_pos_inp, gsrm_slf_attn_bias1_inp, gsrm_slf_attn_bias2_inp]) - # preds = {"predict": prob_out[2]} - preds = {"predict": prob_out["predict"]} - - elif self.rec_algorithm == "SAR": - starttime = time.time() - # valid_ratios = np.concatenate(valid_ratios) - # inputs = [ - # norm_img_batch, - # valid_ratios, - # ] - - with torch.no_grad(): - inp = torch.from_numpy(norm_img_batch) - inp = inp.to(self.device) - preds = self.net(inp) - - elif self.rec_algorithm == "CAN": - starttime = time.time() - norm_img_mask_batch = np.concatenate(norm_img_mask_batch) - word_label_list = np.concatenate(word_label_list) - inputs = [norm_img_batch, norm_img_mask_batch, word_label_list] - - inp = [torch.from_numpy(e_i) for e_i in inputs] - inp = [e_i.to(self.device) for e_i in inp] - with torch.no_grad(): - outputs = self.net(inp) - outputs = [v.cpu().numpy() for k, v in enumerate(outputs)] - - preds = outputs - - else: - starttime = time.time() - - with torch.no_grad(): - inp = torch.from_numpy(norm_img_batch) - inp = inp.to(self.device) - prob_out = self.net(inp) - - if isinstance(prob_out, list): - preds = [v.cpu().numpy() for v in prob_out] - else: - preds = prob_out.cpu().numpy() - - rec_result = self.postprocess_op(preds) - for rno in range(len(rec_result)): - rec_res[indices[beg_img_no + rno]] = rec_result[rno] - elapse += time.time() - starttime - - # 更新进度条,每次增加batch_size,但要注意最后一个batch可能不足batch_size - current_batch_size = min(batch_num, img_num - index * batch_num) - index += 1 - pbar.update(current_batch_size) - - # Fix NaN values in recognition results - for i in range(len(rec_res)): - text, score = rec_res[i] - if isinstance(score, float) and math.isnan(score): - rec_res[i] = (text, 0.0) - - return rec_res, elapse diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_system.py 
b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_system.py deleted file mode 100755 index e35b9a4b1535ad89d7df2e2be6d31c5475d2acb2..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_system.py +++ /dev/null @@ -1,104 +0,0 @@ -import cv2 -import copy -import numpy as np - -from . import predict_rec -from . import predict_det -from . import predict_cls - - -class TextSystem(object): - def __init__(self, args, **kwargs): - self.text_detector = predict_det.TextDetector(args, **kwargs) - self.text_recognizer = predict_rec.TextRecognizer(args, **kwargs) - self.use_angle_cls = args.use_angle_cls - self.drop_score = args.drop_score - if self.use_angle_cls: - self.text_classifier = predict_cls.TextClassifier(args, **kwargs) - - def get_rotate_crop_image(self, img, points): - ''' - img_height, img_width = img.shape[0:2] - left = int(np.min(points[:, 0])) - right = int(np.max(points[:, 0])) - top = int(np.min(points[:, 1])) - bottom = int(np.max(points[:, 1])) - img_crop = img[top:bottom, left:right, :].copy() - points[:, 0] = points[:, 0] - left - points[:, 1] = points[:, 1] - top - ''' - img_crop_width = int( - max( - np.linalg.norm(points[0] - points[1]), - np.linalg.norm(points[2] - points[3]))) - img_crop_height = int( - max( - np.linalg.norm(points[0] - points[3]), - np.linalg.norm(points[1] - points[2]))) - pts_std = np.float32([[0, 0], [img_crop_width, 0], - [img_crop_width, img_crop_height], - [0, img_crop_height]]) - M = cv2.getPerspectiveTransform(points, pts_std) - dst_img = cv2.warpPerspective( - img, - M, (img_crop_width, img_crop_height), - borderMode=cv2.BORDER_REPLICATE, - flags=cv2.INTER_CUBIC) - dst_img_height, dst_img_width = dst_img.shape[0:2] - if dst_img_height * 1.0 / dst_img_width >= 1.5: - dst_img = np.rot90(dst_img) - return dst_img - - def __call__(self, img): - ori_im = img.copy() - dt_boxes, elapse = self.text_detector(img) - print("dt_boxes num : {}, elapse : {}".format( - len(dt_boxes), elapse)) - if dt_boxes is None: - return None, None - img_crop_list = [] - - dt_boxes = sorted_boxes(dt_boxes) - - for bno in range(len(dt_boxes)): - tmp_box = copy.deepcopy(dt_boxes[bno]) - img_crop = self.get_rotate_crop_image(ori_im, tmp_box) - img_crop_list.append(img_crop) - if self.use_angle_cls: - img_crop_list, angle_list, elapse = self.text_classifier( - img_crop_list) - print("cls num : {}, elapse : {}".format( - len(img_crop_list), elapse)) - - rec_res, elapse = self.text_recognizer(img_crop_list) - print("rec_res num : {}, elapse : {}".format( - len(rec_res), elapse)) - # self.print_draw_crop_rec_res(img_crop_list, rec_res) - filter_boxes, filter_rec_res = [], [] - for box, rec_reuslt in zip(dt_boxes, rec_res): - text, score = rec_reuslt - if score >= self.drop_score: - filter_boxes.append(box) - filter_rec_res.append(rec_reuslt) - return filter_boxes, filter_rec_res - - -def sorted_boxes(dt_boxes): - """ - Sort text boxes in order from top to bottom, left to right - args: - dt_boxes(array):detected text boxes with shape [4, 2] - return: - sorted boxes(array) with shape [4, 2] - """ - num_boxes = dt_boxes.shape[0] - sorted_boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0])) - _boxes = list(sorted_boxes) - - for i in range(num_boxes - 1): - if abs(_boxes[i + 1][0][1] - _boxes[i][0][1]) < 10 and \ - (_boxes[i + 1][0][0] < _boxes[i][0][0]): - tmp = _boxes[i] - _boxes[i] = _boxes[i + 1] - _boxes[i + 1] = tmp - return _boxes diff --git 
a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/pytorchocr_utility.py b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/pytorchocr_utility.py deleted file mode 100755 index 912d124ef4683740d014d881cc825673d577b628..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/pytorchocr_utility.py +++ /dev/null @@ -1,227 +0,0 @@ -import os -import math -from pathlib import Path -import numpy as np -import cv2 -import argparse - - -root_dir = Path(__file__).resolve().parent.parent.parent -DEFAULT_CFG_PATH = root_dir / "pytorchocr" / "utils" / "resources" / "arch_config.yaml" - - -def init_args(): - def str2bool(v): - return v.lower() in ("true", "t", "1") - - parser = argparse.ArgumentParser() - # params for prediction engine - parser.add_argument("--use_gpu", type=str2bool, default=False) - parser.add_argument("--det", type=str2bool, default=True) - parser.add_argument("--rec", type=str2bool, default=True) - parser.add_argument("--device", type=str, default='cpu') - # parser.add_argument("--ir_optim", type=str2bool, default=True) - # parser.add_argument("--use_tensorrt", type=str2bool, default=False) - # parser.add_argument("--use_fp16", type=str2bool, default=False) - parser.add_argument("--gpu_mem", type=int, default=500) - parser.add_argument("--warmup", type=str2bool, default=False) - - # params for text detector - parser.add_argument("--image_dir", type=str) - parser.add_argument("--det_algorithm", type=str, default='DB') - parser.add_argument("--det_model_path", type=str) - parser.add_argument("--det_limit_side_len", type=float, default=960) - parser.add_argument("--det_limit_type", type=str, default='max') - - # DB parmas - parser.add_argument("--det_db_thresh", type=float, default=0.3) - parser.add_argument("--det_db_box_thresh", type=float, default=0.6) - parser.add_argument("--det_db_unclip_ratio", type=float, default=1.5) - parser.add_argument("--max_batch_size", type=int, default=10) - parser.add_argument("--use_dilation", type=str2bool, default=False) - parser.add_argument("--det_db_score_mode", type=str, default="fast") - - # EAST parmas - parser.add_argument("--det_east_score_thresh", type=float, default=0.8) - parser.add_argument("--det_east_cover_thresh", type=float, default=0.1) - parser.add_argument("--det_east_nms_thresh", type=float, default=0.2) - - # SAST parmas - parser.add_argument("--det_sast_score_thresh", type=float, default=0.5) - parser.add_argument("--det_sast_nms_thresh", type=float, default=0.2) - parser.add_argument("--det_sast_polygon", type=str2bool, default=False) - - # PSE parmas - parser.add_argument("--det_pse_thresh", type=float, default=0) - parser.add_argument("--det_pse_box_thresh", type=float, default=0.85) - parser.add_argument("--det_pse_min_area", type=float, default=16) - parser.add_argument("--det_pse_box_type", type=str, default='box') - parser.add_argument("--det_pse_scale", type=int, default=1) - - # FCE parmas - parser.add_argument("--scales", type=list, default=[8, 16, 32]) - parser.add_argument("--alpha", type=float, default=1.0) - parser.add_argument("--beta", type=float, default=1.0) - parser.add_argument("--fourier_degree", type=int, default=5) - parser.add_argument("--det_fce_box_type", type=str, default='poly') - - # params for text recognizer - parser.add_argument("--rec_algorithm", type=str, default='CRNN') - parser.add_argument("--rec_model_path", type=str) - parser.add_argument("--rec_image_inverse", type=str2bool, default=True) - 
parser.add_argument("--rec_image_shape", type=str, default="3, 48, 320") - parser.add_argument("--rec_char_type", type=str, default='ch') - parser.add_argument("--rec_batch_num", type=int, default=6) - parser.add_argument("--max_text_length", type=int, default=25) - - parser.add_argument("--use_space_char", type=str2bool, default=True) - parser.add_argument("--drop_score", type=float, default=0.5) - parser.add_argument("--limited_max_width", type=int, default=1280) - parser.add_argument("--limited_min_width", type=int, default=16) - - parser.add_argument( - "--vis_font_path", type=str, - default=os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), 'doc/fonts/simfang.ttf')) - parser.add_argument( - "--rec_char_dict_path", - type=str, - default=os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), - 'pytorchocr/utils/ppocr_keys_v1.txt')) - - # params for text classifier - parser.add_argument("--use_angle_cls", type=str2bool, default=False) - parser.add_argument("--cls_model_path", type=str) - parser.add_argument("--cls_image_shape", type=str, default="3, 48, 192") - parser.add_argument("--label_list", type=list, default=['0', '180']) - parser.add_argument("--cls_batch_num", type=int, default=6) - parser.add_argument("--cls_thresh", type=float, default=0.9) - - parser.add_argument("--enable_mkldnn", type=str2bool, default=False) - parser.add_argument("--use_pdserving", type=str2bool, default=False) - - # params for e2e - parser.add_argument("--e2e_algorithm", type=str, default='PGNet') - parser.add_argument("--e2e_model_path", type=str) - parser.add_argument("--e2e_limit_side_len", type=float, default=768) - parser.add_argument("--e2e_limit_type", type=str, default='max') - - # PGNet parmas - parser.add_argument("--e2e_pgnet_score_thresh", type=float, default=0.5) - parser.add_argument( - "--e2e_char_dict_path", type=str, - default=os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), - 'pytorchocr/utils/ic15_dict.txt')) - parser.add_argument("--e2e_pgnet_valid_set", type=str, default='totaltext') - parser.add_argument("--e2e_pgnet_polygon", type=bool, default=True) - parser.add_argument("--e2e_pgnet_mode", type=str, default='fast') - - # SR parmas - parser.add_argument("--sr_model_path", type=str) - parser.add_argument("--sr_image_shape", type=str, default="3, 32, 128") - parser.add_argument("--sr_batch_num", type=int, default=1) - - # params .yaml - parser.add_argument("--det_yaml_path", type=str, default=None) - parser.add_argument("--rec_yaml_path", type=str, default=None) - parser.add_argument("--cls_yaml_path", type=str, default=None) - parser.add_argument("--e2e_yaml_path", type=str, default=None) - parser.add_argument("--sr_yaml_path", type=str, default=None) - - # multi-process - parser.add_argument("--use_mp", type=str2bool, default=False) - parser.add_argument("--total_process_num", type=int, default=1) - parser.add_argument("--process_id", type=int, default=0) - - parser.add_argument("--benchmark", type=str2bool, default=False) - parser.add_argument("--save_log_path", type=str, default="./log_output/") - - parser.add_argument("--show_log", type=str2bool, default=True) - - return parser - -def parse_args(): - parser = init_args() - return parser.parse_args() - -def get_default_config(args): - return vars(args) - - -def read_network_config_from_yaml(yaml_path, char_num=None): - if not os.path.exists(yaml_path): - raise FileNotFoundError('{} is not 
existed.'.format(yaml_path)) - import yaml - with open(yaml_path, encoding='utf-8') as f: - res = yaml.safe_load(f) - if res.get('Architecture') is None: - raise ValueError('{} has no Architecture'.format(yaml_path)) - if res['Architecture']['Head']['name'] == 'MultiHead' and char_num is not None: - res['Architecture']['Head']['out_channels_list'] = { - 'CTCLabelDecode': char_num, - 'SARLabelDecode': char_num + 2, - 'NRTRLabelDecode': char_num + 3 - } - return res['Architecture'] - -def AnalysisConfig(weights_path, yaml_path=None, char_num=None): - if not os.path.exists(os.path.abspath(weights_path)): - raise FileNotFoundError('{} is not found.'.format(weights_path)) - - if yaml_path is not None: - return read_network_config_from_yaml(yaml_path, char_num=char_num) - - -def resize_img(img, input_size=600): - """ - resize img and limit the longest side of the image to input_size - """ - img = np.array(img) - im_shape = img.shape - im_size_max = np.max(im_shape[0:2]) - im_scale = float(input_size) / float(im_size_max) - img = cv2.resize(img, None, None, fx=im_scale, fy=im_scale) - return img - - -def str_count(s): - """ - Count the number of Chinese characters, - a single English character and a single number - equal to half the length of Chinese characters. - args: - s(string): the input of string - return(int): - the number of Chinese characters - """ - import string - count_zh = count_pu = 0 - s_len = len(s) - en_dg_count = 0 - for c in s: - if c in string.ascii_letters or c.isdigit() or c.isspace(): - en_dg_count += 1 - elif c.isalpha(): - count_zh += 1 - else: - count_pu += 1 - return s_len - math.ceil(en_dg_count / 2) - - -def base64_to_cv2(b64str): - import base64 - data = base64.b64decode(b64str.encode('utf8')) - data = np.fromstring(data, np.uint8) - data = cv2.imdecode(data, cv2.IMREAD_COLOR) - return data - - -def get_arch_config(model_path): - from omegaconf import OmegaConf - all_arch_config = OmegaConf.load(DEFAULT_CFG_PATH) - path = Path(model_path) - file_name = path.stem - if file_name not in all_arch_config: - raise ValueError(f"architecture {file_name} is not in arch_config.yaml") - - arch_config = all_arch_config[file_name] - return arch_config \ No newline at end of file diff --git a/magic_pdf/model/sub_modules/reading_oreder/__init__.py b/magic_pdf/model/sub_modules/reading_oreder/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py b/magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/magic_pdf/model/sub_modules/reading_oreder/layoutreader/helpers.py b/magic_pdf/model/sub_modules/reading_oreder/layoutreader/helpers.py deleted file mode 100644 index dfe71a89cf99e1f5807055115ceeda3abbceb363..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/reading_oreder/layoutreader/helpers.py +++ /dev/null @@ -1,125 +0,0 @@ -from collections import defaultdict -from typing import List, Dict - -import torch -from transformers import LayoutLMv3ForTokenClassification - -MAX_LEN = 510 -CLS_TOKEN_ID = 0 -UNK_TOKEN_ID = 3 -EOS_TOKEN_ID = 2 - - -class DataCollator: - def __call__(self, features: List[dict]) -> Dict[str, torch.Tensor]: - bbox = [] - labels = [] - input_ids = [] - attention_mask = [] - - # clip bbox and labels to max length, build input_ids and attention_mask - 
for feature in features: - _bbox = feature["source_boxes"] - if len(_bbox) > MAX_LEN: - _bbox = _bbox[:MAX_LEN] - _labels = feature["target_index"] - if len(_labels) > MAX_LEN: - _labels = _labels[:MAX_LEN] - _input_ids = [UNK_TOKEN_ID] * len(_bbox) - _attention_mask = [1] * len(_bbox) - assert len(_bbox) == len(_labels) == len(_input_ids) == len(_attention_mask) - bbox.append(_bbox) - labels.append(_labels) - input_ids.append(_input_ids) - attention_mask.append(_attention_mask) - - # add CLS and EOS tokens - for i in range(len(bbox)): - bbox[i] = [[0, 0, 0, 0]] + bbox[i] + [[0, 0, 0, 0]] - labels[i] = [-100] + labels[i] + [-100] - input_ids[i] = [CLS_TOKEN_ID] + input_ids[i] + [EOS_TOKEN_ID] - attention_mask[i] = [1] + attention_mask[i] + [1] - - # padding to max length - max_len = max(len(x) for x in bbox) - for i in range(len(bbox)): - bbox[i] = bbox[i] + [[0, 0, 0, 0]] * (max_len - len(bbox[i])) - labels[i] = labels[i] + [-100] * (max_len - len(labels[i])) - input_ids[i] = input_ids[i] + [EOS_TOKEN_ID] * (max_len - len(input_ids[i])) - attention_mask[i] = attention_mask[i] + [0] * ( - max_len - len(attention_mask[i]) - ) - - ret = { - "bbox": torch.tensor(bbox), - "attention_mask": torch.tensor(attention_mask), - "labels": torch.tensor(labels), - "input_ids": torch.tensor(input_ids), - } - # set label > MAX_LEN to -100, because original labels may be > MAX_LEN - ret["labels"][ret["labels"] > MAX_LEN] = -100 - # set label > 0 to label-1, because original labels are 1-indexed - ret["labels"][ret["labels"] > 0] -= 1 - return ret - - -def boxes2inputs(boxes: List[List[int]]) -> Dict[str, torch.Tensor]: - bbox = [[0, 0, 0, 0]] + boxes + [[0, 0, 0, 0]] - input_ids = [CLS_TOKEN_ID] + [UNK_TOKEN_ID] * len(boxes) + [EOS_TOKEN_ID] - attention_mask = [1] + [1] * len(boxes) + [1] - return { - "bbox": torch.tensor([bbox]), - "attention_mask": torch.tensor([attention_mask]), - "input_ids": torch.tensor([input_ids]), - } - - -def prepare_inputs( - inputs: Dict[str, torch.Tensor], model: LayoutLMv3ForTokenClassification -) -> Dict[str, torch.Tensor]: - ret = {} - for k, v in inputs.items(): - v = v.to(model.device) - if torch.is_floating_point(v): - v = v.to(model.dtype) - ret[k] = v - return ret - - -def parse_logits(logits: torch.Tensor, length: int) -> List[int]: - """ - parse logits to orders - - :param logits: logits from model - :param length: input length - :return: orders - """ - logits = logits[1 : length + 1, :length] - orders = logits.argsort(descending=False).tolist() - ret = [o.pop() for o in orders] - while True: - order_to_idxes = defaultdict(list) - for idx, order in enumerate(ret): - order_to_idxes[order].append(idx) - # filter idxes len > 1 - order_to_idxes = {k: v for k, v in order_to_idxes.items() if len(v) > 1} - if not order_to_idxes: - break - # filter - for order, idxes in order_to_idxes.items(): - # find original logits of idxes - idxes_to_logit = {} - for idx in idxes: - idxes_to_logit[idx] = logits[idx, order] - idxes_to_logit = sorted( - idxes_to_logit.items(), key=lambda x: x[1], reverse=True - ) - # keep the highest logit as order, set others to next candidate - for idx, _ in idxes_to_logit[1:]: - ret[idx] = orders[idx].pop() - - return ret - - -def check_duplicate(a: List[int]) -> bool: - return len(a) != len(set(a)) diff --git a/magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py b/magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py deleted file mode 100644 index 7a36f527673f7ba830a768027ae4b7e1659f4b5f..0000000000000000000000000000000000000000 
--- a/magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py +++ /dev/null @@ -1,242 +0,0 @@ -from typing import List -import cv2 -import numpy as np - - -def projection_by_bboxes(boxes: np.array, axis: int) -> np.ndarray: - """ - 通过一组 bbox 获得投影直方图,最后以 per-pixel 形式输出 - - Args: - boxes: [N, 4] - axis: 0-x坐标向水平方向投影, 1-y坐标向垂直方向投影 - - Returns: - 1D 投影直方图,长度为投影方向坐标的最大值(我们不需要图片的实际边长,因为只是要找文本框的间隔) - - """ - assert axis in [0, 1] - length = np.max(boxes[:, axis::2]) - res = np.zeros(length, dtype=int) - # TODO: how to remove for loop? - for start, end in boxes[:, axis::2]: - res[start:end] += 1 - return res - - -# from: https://dothinking.github.io/2021-06-19-%E9%80%92%E5%BD%92%E6%8A%95%E5%BD%B1%E5%88%86%E5%89%B2%E7%AE%97%E6%B3%95/#:~:text=%E9%80%92%E5%BD%92%E6%8A%95%E5%BD%B1%E5%88%86%E5%89%B2%EF%BC%88Recursive%20XY,%EF%BC%8C%E5%8F%AF%E4%BB%A5%E5%88%92%E5%88%86%E6%AE%B5%E8%90%BD%E3%80%81%E8%A1%8C%E3%80%82 -def split_projection_profile(arr_values: np.array, min_value: float, min_gap: float): - """Split projection profile: - - ``` - ┌──┐ - arr_values │ │ ┌─┐─── - ┌──┐ │ │ │ │ | - │ │ │ │ ┌───┐ │ │min_value - │ │<- min_gap ->│ │ │ │ │ │ | - ────┴──┴─────────────┴──┴─┴───┴─┴─┴─┴─── - 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 - ``` - - Args: - arr_values (np.array): 1-d array representing the projection profile. - min_value (float): Ignore the profile if `arr_value` is less than `min_value`. - min_gap (float): Ignore the gap if less than this value. - - Returns: - tuple: Start indexes and end indexes of split groups. - """ - # all indexes with projection height exceeding the threshold - arr_index = np.where(arr_values > min_value)[0] - if not len(arr_index): - return - - # find zero intervals between adjacent projections - # | | || - # ||||<- zero-interval -> ||||| - arr_diff = arr_index[1:] - arr_index[0:-1] - arr_diff_index = np.where(arr_diff > min_gap)[0] - arr_zero_intvl_start = arr_index[arr_diff_index] - arr_zero_intvl_end = arr_index[arr_diff_index + 1] - - # convert to index of projection range: - # the start index of zero interval is the end index of projection - arr_start = np.insert(arr_zero_intvl_end, 0, arr_index[0]) - arr_end = np.append(arr_zero_intvl_start, arr_index[-1]) - arr_end += 1 # end index will be excluded as index slice - - return arr_start, arr_end - - -def recursive_xy_cut(boxes: np.ndarray, indices: List[int], res: List[int]): - """ - - Args: - boxes: (N, 4) - indices: 递归过程中始终表示 box 在原始数据中的索引 - res: 保存输出结果 - - """ - # 向 y 轴投影 - assert len(boxes) == len(indices) - - _indices = boxes[:, 1].argsort() - y_sorted_boxes = boxes[_indices] - y_sorted_indices = indices[_indices] - - # debug_vis(y_sorted_boxes, y_sorted_indices) - - y_projection = projection_by_bboxes(boxes=y_sorted_boxes, axis=1) - pos_y = split_projection_profile(y_projection, 0, 1) - if not pos_y: - return - - arr_y0, arr_y1 = pos_y - for r0, r1 in zip(arr_y0, arr_y1): - # [r0, r1] 表示按照水平切分,有 bbox 的区域,对这些区域会再进行垂直切分 - _indices = (r0 <= y_sorted_boxes[:, 1]) & (y_sorted_boxes[:, 1] < r1) - - y_sorted_boxes_chunk = y_sorted_boxes[_indices] - y_sorted_indices_chunk = y_sorted_indices[_indices] - - _indices = y_sorted_boxes_chunk[:, 0].argsort() - x_sorted_boxes_chunk = y_sorted_boxes_chunk[_indices] - x_sorted_indices_chunk = y_sorted_indices_chunk[_indices] - - # 往 x 方向投影 - x_projection = projection_by_bboxes(boxes=x_sorted_boxes_chunk, axis=0) - pos_x = split_projection_profile(x_projection, 0, 1) - if not pos_x: - continue - - arr_x0, arr_x1 = pos_x - if len(arr_x0) == 1: - # x 方向无法切分 - 
res.extend(x_sorted_indices_chunk) - continue - - # x 方向上能分开,继续递归调用 - for c0, c1 in zip(arr_x0, arr_x1): - _indices = (c0 <= x_sorted_boxes_chunk[:, 0]) & ( - x_sorted_boxes_chunk[:, 0] < c1 - ) - recursive_xy_cut( - x_sorted_boxes_chunk[_indices], x_sorted_indices_chunk[_indices], res - ) - - -def points_to_bbox(points): - assert len(points) == 8 - - # [x1,y1,x2,y2,x3,y3,x4,y4] - left = min(points[::2]) - right = max(points[::2]) - top = min(points[1::2]) - bottom = max(points[1::2]) - - left = max(left, 0) - top = max(top, 0) - right = max(right, 0) - bottom = max(bottom, 0) - return [left, top, right, bottom] - - -def bbox2points(bbox): - left, top, right, bottom = bbox - return [left, top, right, top, right, bottom, left, bottom] - - -def vis_polygon(img, points, thickness=2, color=None): - br2bl_color = color - tl2tr_color = color - tr2br_color = color - bl2tl_color = color - cv2.line( - img, - (points[0][0], points[0][1]), - (points[1][0], points[1][1]), - color=tl2tr_color, - thickness=thickness, - ) - - cv2.line( - img, - (points[1][0], points[1][1]), - (points[2][0], points[2][1]), - color=tr2br_color, - thickness=thickness, - ) - - cv2.line( - img, - (points[2][0], points[2][1]), - (points[3][0], points[3][1]), - color=br2bl_color, - thickness=thickness, - ) - - cv2.line( - img, - (points[3][0], points[3][1]), - (points[0][0], points[0][1]), - color=bl2tl_color, - thickness=thickness, - ) - return img - - -def vis_points( - img: np.ndarray, points, texts: List[str] = None, color=(0, 200, 0) -) -> np.ndarray: - """ - - Args: - img: - points: [N, 8] 8: x1,y1,x2,y2,x3,y3,x3,y4 - texts: - color: - - Returns: - - """ - points = np.array(points) - if texts is not None: - assert len(texts) == points.shape[0] - - for i, _points in enumerate(points): - vis_polygon(img, _points.reshape(-1, 2), thickness=2, color=color) - bbox = points_to_bbox(_points) - left, top, right, bottom = bbox - cx = (left + right) // 2 - cy = (top + bottom) // 2 - - txt = texts[i] - font = cv2.FONT_HERSHEY_SIMPLEX - cat_size = cv2.getTextSize(txt, font, 0.5, 2)[0] - - img = cv2.rectangle( - img, - (cx - 5 * len(txt), cy - cat_size[1] - 5), - (cx - 5 * len(txt) + cat_size[0], cy - 5), - color, - -1, - ) - - img = cv2.putText( - img, - txt, - (cx - 5 * len(txt), cy - 5), - font, - 0.5, - (255, 255, 255), - thickness=1, - lineType=cv2.LINE_AA, - ) - - return img - - -def vis_polygons_with_index(image, points): - texts = [str(i) for i in range(len(points))] - res_img = vis_points(image.copy(), points, texts) - return res_img \ No newline at end of file diff --git a/magic_pdf/model/sub_modules/table/__init__.py b/magic_pdf/model/sub_modules/table/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/magic_pdf/model/sub_modules/table/rapidtable/__init__.py b/magic_pdf/model/sub_modules/table/rapidtable/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py b/magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py deleted file mode 100644 index b698b3efb3c7ce57f2b526f6c88a1b0d04a0fd35..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +++ /dev/null @@ -1,103 +0,0 @@ -import os -from pathlib import Path -import cv2 -import numpy as np -import torch -from loguru import logger -from rapid_table import RapidTable, RapidTableInput -from 
rapid_table.main import ModelType - -from magic_pdf.libs.config_reader import get_device - - -class RapidTableModel(object): - def __init__(self, ocr_engine, table_sub_model_name='slanet_plus'): - sub_model_list = [model.value for model in ModelType] - if table_sub_model_name is None: - input_args = RapidTableInput() - elif table_sub_model_name in sub_model_list: - if torch.cuda.is_available() and table_sub_model_name == "unitable": - input_args = RapidTableInput(model_type=table_sub_model_name, use_cuda=True, device=get_device()) - else: - root_dir = Path(__file__).absolute().parent.parent.parent.parent.parent - slanet_plus_model_path = os.path.join(root_dir, 'resources', 'slanet_plus', 'slanet-plus.onnx') - input_args = RapidTableInput(model_type=table_sub_model_name, model_path=slanet_plus_model_path) - else: - raise ValueError(f"Invalid table_sub_model_name: {table_sub_model_name}. It must be one of {sub_model_list}") - - self.table_model = RapidTable(input_args) - - # self.ocr_model_name = "RapidOCR" - # if torch.cuda.is_available(): - # from rapidocr_paddle import RapidOCR - # self.ocr_engine = RapidOCR(det_use_cuda=True, cls_use_cuda=True, rec_use_cuda=True) - # else: - # from rapidocr_onnxruntime import RapidOCR - # self.ocr_engine = RapidOCR() - - # self.ocr_model_name = "PaddleOCR" - self.ocr_engine = ocr_engine - - - def predict(self, image): - bgr_image = cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR) - - # First check the overall image aspect ratio (height/width) - img_height, img_width = bgr_image.shape[:2] - img_aspect_ratio = img_height / img_width if img_width > 0 else 1.0 - img_is_portrait = img_aspect_ratio > 1.2 - - if img_is_portrait: - - det_res = self.ocr_engine.ocr(bgr_image, rec=False)[0] - # Check if table is rotated by analyzing text box aspect ratios - is_rotated = False - if det_res: - vertical_count = 0 - - for box_ocr_res in det_res: - p1, p2, p3, p4 = box_ocr_res - - # Calculate width and height - width = p3[0] - p1[0] - height = p3[1] - p1[1] - - aspect_ratio = width / height if height > 0 else 1.0 - - # Count vertical vs horizontal text boxes - if aspect_ratio < 0.8: # Taller than wide - vertical text - vertical_count += 1 - # elif aspect_ratio > 1.2: # Wider than tall - horizontal text - # horizontal_count += 1 - - # If we have more vertical text boxes than horizontal ones, - # and vertical ones are significant, table might be rotated - if vertical_count >= len(det_res) * 0.3: - is_rotated = True - - # logger.debug(f"Text orientation analysis: vertical={vertical_count}, det_res={len(det_res)}, rotated={is_rotated}") - - # Rotate image if necessary - if is_rotated: - # logger.debug("Table appears to be in portrait orientation, rotating 90 degrees clockwise") - image = cv2.rotate(np.asarray(image), cv2.ROTATE_90_CLOCKWISE) - bgr_image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) - - # Continue with OCR on potentially rotated image - ocr_result = self.ocr_engine.ocr(bgr_image)[0] - if ocr_result: - ocr_result = [[item[0], item[1][0], item[1][1]] for item in ocr_result if - len(item) == 2 and isinstance(item[1], tuple)] - else: - ocr_result = None - - - if ocr_result: - table_results = self.table_model(np.asarray(image), ocr_result) - html_code = table_results.pred_html - table_cell_bboxes = table_results.cell_bboxes - logic_points = table_results.logic_points - elapse = table_results.elapse - return html_code, table_cell_bboxes, logic_points, elapse - else: - return None, None, None, None diff --git a/magic_pdf/model/sub_modules/table/table_utils.py 
b/magic_pdf/model/sub_modules/table/table_utils.py deleted file mode 100644 index f04bf98d5d14c6bd69184eac94a54a88b3ad50e7..0000000000000000000000000000000000000000 --- a/magic_pdf/model/sub_modules/table/table_utils.py +++ /dev/null @@ -1,11 +0,0 @@ -import re - - -def minify_html(html): - # 移除多余的空白字符 - html = re.sub(r'\s+', ' ', html) - # 移除行尾的空白字符 - html = re.sub(r'\s*>\s*', '>', html) - # 移除标签前的空白字符 - html = re.sub(r'\s*<\s*', '<', html) - return html.strip() \ No newline at end of file diff --git a/magic_pdf/operators/__init__.py b/magic_pdf/operators/__init__.py deleted file mode 100644 index 84ae24aefa4153ff32b5cc540da1b730ad927c6a..0000000000000000000000000000000000000000 --- a/magic_pdf/operators/__init__.py +++ /dev/null @@ -1,94 +0,0 @@ -from abc import ABC, abstractmethod -from typing import Callable - -from magic_pdf.data.data_reader_writer import DataWriter -from magic_pdf.data.dataset import Dataset -from magic_pdf.operators.pipes import PipeResult - - -class InferenceResultBase(ABC): - - @abstractmethod - def __init__(self, inference_results: list, dataset: Dataset): - """Initialized method. - - Args: - inference_results (list): the inference result generated by model - dataset (Dataset): the dataset related with model inference result - """ - pass - - @abstractmethod - def draw_model(self, file_path: str) -> None: - """Draw model inference result. - - Args: - file_path (str): the output file path - """ - pass - - @abstractmethod - def dump_model(self, writer: DataWriter, file_path: str): - """Dump model inference result to file. - - Args: - writer (DataWriter): writer handle - file_path (str): the location of target file - """ - pass - - @abstractmethod - def get_infer_res(self): - """Get the inference result. - - Returns: - list: the inference result generated by model - """ - pass - - @abstractmethod - def apply(self, proc: Callable, *args, **kwargs): - """Apply callable method which. - - Args: - proc (Callable): invoke proc as follows: - proc(inference_result, *args, **kwargs) - - Returns: - Any: return the result generated by proc - """ - pass - - def pipe_txt_mode( - self, - imageWriter: DataWriter, - start_page_id=0, - end_page_id=None, - debug_mode=False, - lang=None, - ) -> PipeResult: - """Post-proc the model inference result, Extract the text using the - third library, such as `pymupdf` - - Args: - imageWriter (DataWriter): the image writer handle - start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process - end_page_id (int, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process - debug_mode (bool, optional): Defaults to False. will dump more log if enabled - lang (str, optional): Defaults to None. 
- - Returns: - PipeResult: the result - """ - pass - - @abstractmethod - def pipe_ocr_mode( - self, - imageWriter: DataWriter, - start_page_id=0, - end_page_id=None, - debug_mode=False, - lang=None, - ) -> PipeResult: - pass diff --git a/magic_pdf/operators/models.py b/magic_pdf/operators/models.py deleted file mode 100644 index 34cbfe4bd1c804c4e9ecc3888cc6805948d2f164..0000000000000000000000000000000000000000 --- a/magic_pdf/operators/models.py +++ /dev/null @@ -1,154 +0,0 @@ -import copy -import json -import os -from typing import Callable - -from magic_pdf.config.constants import PARSE_TYPE_OCR, PARSE_TYPE_TXT -from magic_pdf.config.enums import SupportedPdfParseMethod -from magic_pdf.data.data_reader_writer import DataWriter -from magic_pdf.data.dataset import Dataset -from magic_pdf.libs.draw_bbox import draw_model_bbox -from magic_pdf.libs.version import __version__ -from magic_pdf.operators.pipes import PipeResult -from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union -from magic_pdf.operators import InferenceResultBase - -class InferenceResult(InferenceResultBase): - def __init__(self, inference_results: list, dataset: Dataset): - """Initialized method. - - Args: - inference_results (list): the inference result generated by model - dataset (Dataset): the dataset related with model inference result - """ - self._infer_res = inference_results - self._dataset = dataset - - def draw_model(self, file_path: str) -> None: - """Draw model inference result. - - Args: - file_path (str): the output file path - """ - dir_name = os.path.dirname(file_path) - base_name = os.path.basename(file_path) - if not os.path.exists(dir_name): - os.makedirs(dir_name, exist_ok=True) - draw_model_bbox( - copy.deepcopy(self._infer_res), self._dataset, dir_name, base_name - ) - - def dump_model(self, writer: DataWriter, file_path: str): - """Dump model inference result to file. - - Args: - writer (DataWriter): writer handle - file_path (str): the location of target file - """ - writer.write_string( - file_path, json.dumps(self._infer_res, ensure_ascii=False, indent=4) - ) - - def get_infer_res(self): - """Get the inference result. - - Returns: - list: the inference result generated by model - """ - return self._infer_res - - def apply(self, proc: Callable, *args, **kwargs): - """Apply callable method which. - - Args: - proc (Callable): invoke proc as follows: - proc(inference_result, *args, **kwargs) - - Returns: - Any: return the result generated by proc - """ - return proc(copy.deepcopy(self._infer_res), *args, **kwargs) - - def pipe_txt_mode( - self, - imageWriter: DataWriter, - start_page_id=0, - end_page_id=None, - debug_mode=False, - lang=None, - ) -> PipeResult: - """Post-proc the model inference result, Extract the text using the - third library, such as `pymupdf` - - Args: - imageWriter (DataWriter): the image writer handle - start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process - end_page_id (int, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process - debug_mode (bool, optional): Defaults to False. will dump more log if enabled - lang (str, optional): Defaults to None. 
- - Returns: - PipeResult: the result - """ - - def proc(*args, **kwargs) -> PipeResult: - res = pdf_parse_union(*args, **kwargs) - res['_parse_type'] = PARSE_TYPE_TXT - res['_version_name'] = __version__ - if 'lang' in kwargs and kwargs['lang'] is not None: - res['lang'] = kwargs['lang'] - return PipeResult(res, self._dataset) - - res = self.apply( - proc, - self._dataset, - imageWriter, - SupportedPdfParseMethod.TXT, - start_page_id=start_page_id, - end_page_id=end_page_id, - debug_mode=debug_mode, - lang=lang, - ) - return res - - def pipe_ocr_mode( - self, - imageWriter: DataWriter, - start_page_id=0, - end_page_id=None, - debug_mode=False, - lang=None, - ) -> PipeResult: - """Post-proc the model inference result, Extract the text using `OCR` - technical. - - Args: - imageWriter (DataWriter): the image writer handle - start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process - end_page_id (int, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process - debug_mode (bool, optional): Defaults to False. will dump more log if enabled - lang (str, optional): Defaults to None. - - Returns: - PipeResult: the result - """ - - def proc(*args, **kwargs) -> PipeResult: - res = pdf_parse_union(*args, **kwargs) - res['_parse_type'] = PARSE_TYPE_OCR - res['_version_name'] = __version__ - if 'lang' in kwargs and kwargs['lang'] is not None: - res['lang'] = kwargs['lang'] - return PipeResult(res, self._dataset) - - res = self.apply( - proc, - self._dataset, - imageWriter, - SupportedPdfParseMethod.OCR, - start_page_id=start_page_id, - end_page_id=end_page_id, - debug_mode=debug_mode, - lang=lang, - ) - return res diff --git a/magic_pdf/operators/pipes.py b/magic_pdf/operators/pipes.py deleted file mode 100644 index 8a9f7a563682d5271017550d4753ec3d045e6d43..0000000000000000000000000000000000000000 --- a/magic_pdf/operators/pipes.py +++ /dev/null @@ -1,191 +0,0 @@ -import copy -import json -import os -from typing import Callable - -from magic_pdf.config.make_content_config import DropMode, MakeMode -from magic_pdf.data.data_reader_writer import DataWriter -from magic_pdf.data.dataset import Dataset -from magic_pdf.dict2md.ocr_mkcontent import union_make -from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox, - draw_span_bbox) -from magic_pdf.libs.json_compressor import JsonCompressor - - -class PipeResult: - def __init__(self, pipe_res, dataset: Dataset): - """Initialized. - - Args: - pipe_res (list[dict]): the pipeline processed result of model inference result - dataset (Dataset): the dataset associated with pipe_res - """ - self._pipe_res = pipe_res - self._dataset = dataset - - def get_markdown( - self, - img_dir_or_bucket_prefix: str, - drop_mode=DropMode.NONE, - md_make_mode=MakeMode.MM_MD, - ) -> str: - """Get markdown content. - - Args: - img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure - drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE. - md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD. 
- - Returns: - str: return markdown content - """ - pdf_info_list = self._pipe_res['pdf_info'] - md_content = union_make( - pdf_info_list, md_make_mode, drop_mode, img_dir_or_bucket_prefix - ) - return md_content - - def dump_md( - self, - writer: DataWriter, - file_path: str, - img_dir_or_bucket_prefix: str, - drop_mode=DropMode.NONE, - md_make_mode=MakeMode.MM_MD, - ): - """Dump The Markdown. - - Args: - writer (DataWriter): File writer handle - file_path (str): The file location of markdown - img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure - drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE. - md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD. - """ - - md_content = self.get_markdown( - img_dir_or_bucket_prefix, drop_mode=drop_mode, md_make_mode=md_make_mode - ) - writer.write_string(file_path, md_content) - - def get_content_list( - self, - image_dir_or_bucket_prefix: str, - drop_mode=DropMode.NONE, - ) -> str: - """Get Content List. - - Args: - image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure - drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE. - - Returns: - str: content list content - """ - pdf_info_list = self._pipe_res['pdf_info'] - content_list = union_make( - pdf_info_list, - MakeMode.STANDARD_FORMAT, - drop_mode, - image_dir_or_bucket_prefix, - ) - return content_list - - def dump_content_list( - self, - writer: DataWriter, - file_path: str, - image_dir_or_bucket_prefix: str, - drop_mode=DropMode.NONE, - ): - """Dump Content List. - - Args: - writer (DataWriter): File writer handle - file_path (str): The file location of content list - image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure - drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE. - """ - content_list = self.get_content_list( - image_dir_or_bucket_prefix, drop_mode=drop_mode, - ) - writer.write_string( - file_path, json.dumps(content_list, ensure_ascii=False, indent=4) - ) - - def get_middle_json(self) -> str: - """Get middle json. - - Returns: - str: The content of middle json - """ - return json.dumps(self._pipe_res, ensure_ascii=False, indent=4) - - def dump_middle_json(self, writer: DataWriter, file_path: str): - """Dump the result of pipeline. - - Args: - writer (DataWriter): File writer handler - file_path (str): The file location of middle json - """ - middle_json = self.get_middle_json() - writer.write_string(file_path, middle_json) - - def draw_layout(self, file_path: str) -> None: - """Draw the layout. - - Args: - file_path (str): The file location of layout result file - """ - dir_name = os.path.dirname(file_path) - base_name = os.path.basename(file_path) - if not os.path.exists(dir_name): - os.makedirs(dir_name, exist_ok=True) - pdf_info = self._pipe_res['pdf_info'] - draw_layout_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name) - - def draw_span(self, file_path: str): - """Draw the Span. 
- - Args: - file_path (str): The file location of span result file - """ - dir_name = os.path.dirname(file_path) - base_name = os.path.basename(file_path) - if not os.path.exists(dir_name): - os.makedirs(dir_name, exist_ok=True) - pdf_info = self._pipe_res['pdf_info'] - draw_span_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name) - - def draw_line_sort(self, file_path: str): - """Draw line sort. - - Args: - file_path (str): The file location of line sort result file - """ - dir_name = os.path.dirname(file_path) - base_name = os.path.basename(file_path) - if not os.path.exists(dir_name): - os.makedirs(dir_name, exist_ok=True) - pdf_info = self._pipe_res['pdf_info'] - draw_line_sort_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name) - - def get_compress_pdf_mid_data(self): - """Compress the pipeline result. - - Returns: - str: compress the pipeline result and return - """ - return JsonCompressor.compress_json(self._pipe_res) - - def apply(self, proc: Callable, *args, **kwargs): - """Apply callable method which. - - Args: - proc (Callable): invoke proc as follows: - proc(pipeline_result, *args, **kwargs) - - Returns: - Any: return the result generated by proc - """ - return proc(copy.deepcopy(self._pipe_res), *args, **kwargs) diff --git a/magic_pdf/pdf_parse_union_core_v2.py b/magic_pdf/pdf_parse_union_core_v2.py deleted file mode 100644 index 2ca8fa11f8bb54712be1647a048090dc79257a35..0000000000000000000000000000000000000000 --- a/magic_pdf/pdf_parse_union_core_v2.py +++ /dev/null @@ -1,1049 +0,0 @@ -import copy -import math -import os -import re -import statistics -import time -import warnings -from typing import List - -import cv2 -import fitz -import torch -import numpy as np -from loguru import logger -from tqdm import tqdm - -from magic_pdf.config.enums import SupportedPdfParseMethod -from magic_pdf.config.ocr_content_type import BlockType, ContentType -from magic_pdf.data.dataset import Dataset, PageableData -from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, __is_overlaps_y_exceeds_threshold -from magic_pdf.libs.clean_memory import clean_memory -from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir, get_llm_aided_config, get_device -from magic_pdf.libs.convert_utils import dict_to_list -from magic_pdf.libs.hash_utils import compute_md5 -from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image -from magic_pdf.model.magic_model import MagicModel -from magic_pdf.post_proc.llm_aided import llm_aided_formula, llm_aided_text, llm_aided_title - -from magic_pdf.model.sub_modules.model_init import AtomModelSingleton -from magic_pdf.post_proc.para_split_v3 import para_split -from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2 -from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table -from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split_v2 -from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block -from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, \ - remove_overlaps_min_spans, remove_x_overlapping_chars - -os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新 - - -def __replace_STX_ETX(text_str: str): - """Replace \u0002 and \u0003, as these characters become garbled when extracted using pymupdf. In fact, they were originally quotation marks. 
- Drawback: This issue is only observed in English text; it has not been found in Chinese text so far. - - Args: - text_str (str): raw text - - Returns: - _type_: replaced text - """ # noqa: E501 - if text_str: - s = text_str.replace('\u0002', "'") - s = s.replace('\u0003', "'") - return s - return text_str - - -# 连写字符拆分 -def __replace_ligatures(text: str): - ligatures = { - 'fi': 'fi', 'fl': 'fl', 'ff': 'ff', 'ffi': 'ffi', 'ffl': 'ffl', 'ſt': 'ft', 'st': 'st' - } - return re.sub('|'.join(map(re.escape, ligatures.keys())), lambda m: ligatures[m.group()], text) - - -def chars_to_content(span): - # 检查span中的char是否为空 - if len(span['chars']) == 0: - pass - else: - # 先给chars按char['bbox']的中心点的x坐标排序 - span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2) - - # Calculate the width of each character - char_widths = [char['bbox'][2] - char['bbox'][0] for char in span['chars']] - # Calculate the median width - median_width = statistics.median(char_widths) - - # 通过x轴重叠比率移除一部分char - span = remove_x_overlapping_chars(span, median_width) - - content = '' - for char in span['chars']: - - # 如果下一个char的x0和上一个char的x1距离超过0.25个字符宽度,则需要在中间插入一个空格 - char1 = char - char2 = span['chars'][span['chars'].index(char) + 1] if span['chars'].index(char) + 1 < len(span['chars']) else None - if char2 and char2['bbox'][0] - char1['bbox'][2] > median_width * 0.25 and char['c'] != ' ' and char2['c'] != ' ': - content += f"{char['c']} " - else: - content += char['c'] - - span['content'] = __replace_ligatures(content) - - del span['chars'] - - -LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',) -LINE_START_FLAG = ('(', '(', '"', '“', '【', '{', '《', '<', '「', '『', '【', '[',) - - -def fill_char_in_spans(spans, all_chars): - - # 简单从上到下排一下序 - spans = sorted(spans, key=lambda x: x['bbox'][1]) - - for char in all_chars: - - for span in spans: - if calculate_char_in_span(char['bbox'], span['bbox'], char['c']): - span['chars'].append(char) - break - - need_ocr_spans = [] - for span in spans: - chars_to_content(span) - # 有的span中虽然没有字但有一两个空的占位符,用宽高和content长度过滤 - if len(span['content']) * span['height'] < span['width'] * 0.5: - # logger.info(f"maybe empty span: {len(span['content'])}, {span['height']}, {span['width']}") - need_ocr_spans.append(span) - del span['height'], span['width'] - return need_ocr_spans - - -# 使用鲁棒性更强的中心点坐标判断 -def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33): - char_center_x = (char_bbox[0] + char_bbox[2]) / 2 - char_center_y = (char_bbox[1] + char_bbox[3]) / 2 - span_center_y = (span_bbox[1] + span_bbox[3]) / 2 - span_height = span_bbox[3] - span_bbox[1] - - if ( - span_bbox[0] < char_center_x < span_bbox[2] - and span_bbox[1] < char_center_y < span_bbox[3] - and abs(char_center_y - span_center_y) < span_height * span_height_radio # 字符的中轴和span的中轴高度差不能超过1/4span高度 - ): - return True - else: - # 如果char是LINE_STOP_FLAG,就不用中心点判定,换一种方案(左边界在span区域内,高度判定和之前逻辑一致) - # 主要是给结尾符号一个进入span的机会,这个char还应该离span右边界较近 - if char in LINE_STOP_FLAG: - if ( - (span_bbox[2] - span_height) < char_bbox[0] < span_bbox[2] - and char_center_x > span_bbox[0] - and span_bbox[1] < char_center_y < span_bbox[3] - and abs(char_center_y - span_center_y) < span_height * span_height_radio - ): - return True - elif char in LINE_START_FLAG: - if ( - span_bbox[0] < char_bbox[2] < (span_bbox[0] + span_height) - and char_center_x < span_bbox[2] - and span_bbox[1] < char_center_y < span_bbox[3] - and 
abs(char_center_y - span_center_y) < span_height * span_height_radio - ): - return True - else: - return False - - -def remove_tilted_line(text_blocks): - for block in text_blocks: - remove_lines = [] - for line in block['lines']: - cosine, sine = line['dir'] - # 计算弧度值 - angle_radians = math.atan2(sine, cosine) - # 将弧度值转换为角度值 - angle_degrees = math.degrees(angle_radians) - if 2 < abs(angle_degrees) < 88: - remove_lines.append(line) - for line in remove_lines: - block['lines'].remove(line) - - -def calculate_contrast(img, img_mode) -> float: - """ - 计算给定图像的对比度。 - :param img: 图像,类型为numpy.ndarray - :Param img_mode = 图像的色彩通道,'rgb' 或 'bgr' - :return: 图像的对比度值 - """ - if img_mode == 'rgb': - # 将RGB图像转换为灰度图 - gray_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) - elif img_mode == 'bgr': - # 将BGR图像转换为灰度图 - gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - else: - raise ValueError("Invalid image mode. Please provide 'rgb' or 'bgr'.") - - # 计算均值和标准差 - mean_value = np.mean(gray_img) - std_dev = np.std(gray_img) - # 对比度定义为标准差除以平均值(加上小常数避免除零错误) - contrast = std_dev / (mean_value + 1e-6) - # logger.debug(f"contrast: {contrast}") - return round(contrast, 2) - -# @measure_time -def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang): - # cid用0xfffd表示,连字符拆开 - # text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks'] - - # cid用0xfffd表示,连字符不拆开 - #text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks'] - - # 自定义flags出现较多0xfffd,可能是pymupdf可以自行处理内置字典的pdf,不再使用 - text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks'] - # text_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks'] - - # 移除所有角度不为0或90的line - remove_tilted_line(text_blocks_raw) - - all_pymu_chars = [] - for block in text_blocks_raw: - for line in block['lines']: - cosine, sine = line['dir'] - if abs(cosine) < 0.9 or abs(sine) > 0.1: - continue - for span in line['spans']: - all_pymu_chars.extend(span['chars']) - - # 计算所有sapn的高度的中位数 - span_height_list = [] - for span in spans: - if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]: - continue - span_height = span['bbox'][3] - span['bbox'][1] - span['height'] = span_height - span['width'] = span['bbox'][2] - span['bbox'][0] - span_height_list.append(span_height) - if len(span_height_list) == 0: - return spans - else: - median_span_height = statistics.median(span_height_list) - - useful_spans = [] - unuseful_spans = [] - # 纵向span的两个特征:1. 高度超过多个line 2. 
高宽比超过某个值 - vertical_spans = [] - for span in spans: - if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]: - continue - for block in all_bboxes + all_discarded_blocks: - if block[7] in [BlockType.ImageBody, BlockType.TableBody, BlockType.InterlineEquation]: - continue - if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5: - if span['height'] > median_span_height * 3 and span['height'] > span['width'] * 3: - vertical_spans.append(span) - elif block in all_bboxes: - useful_spans.append(span) - else: - unuseful_spans.append(span) - - break - - """垂直的span框直接用pymu的line进行填充""" - if len(vertical_spans) > 0: - text_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks'] - all_pymu_lines = [] - for block in text_blocks: - for line in block['lines']: - all_pymu_lines.append(line) - - for pymu_line in all_pymu_lines: - for span in vertical_spans: - if calculate_overlap_area_in_bbox1_area_ratio(pymu_line['bbox'], span['bbox']) > 0.5: - for pymu_span in pymu_line['spans']: - span['content'] += pymu_span['text'] - break - - for span in vertical_spans: - if len(span['content']) == 0: - spans.remove(span) - - """水平的span框如果没有char则用ocr进行填充""" - new_spans = [] - - for span in useful_spans + unuseful_spans: - if span['type'] in [ContentType.Text]: - span['chars'] = [] - new_spans.append(span) - - need_ocr_spans = fill_char_in_spans(new_spans, all_pymu_chars) - - if len(need_ocr_spans) > 0: - - # 初始化ocr模型 - # atom_model_manager = AtomModelSingleton() - # ocr_model = atom_model_manager.get_atom_model( - # atom_model_name='ocr', - # ocr_show_log=False, - # det_db_box_thresh=0.3, - # lang=lang - # ) - - for span in need_ocr_spans: - # 对span的bbox截图再ocr - span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode='cv2') - - # 计算span的对比度,低于0.20的span不进行ocr - if calculate_contrast(span_img, img_mode='bgr') <= 0.17: - spans.remove(span) - continue - # pass - - span['content'] = '' - span['score'] = 1 - span['np_img'] = span_img - - - # ocr_res = ocr_model.ocr(span_img, det=False) - # if ocr_res and len(ocr_res) > 0: - # if len(ocr_res[0]) > 0: - # ocr_text, ocr_score = ocr_res[0][0] - # # logger.info(f"ocr_text: {ocr_text}, ocr_score: {ocr_score}") - # if ocr_score > 0.5 and len(ocr_text) > 0: - # span['content'] = ocr_text - # span['score'] = float(round(ocr_score, 2)) - # else: - # spans.remove(span) - - return spans - - -def model_init(model_name: str): - from transformers import LayoutLMv3ForTokenClassification - device_name = get_device() - bf_16_support = False - if device_name.startswith("cuda"): - bf_16_support = torch.cuda.is_bf16_supported() - elif device_name.startswith("mps"): - bf_16_support = True - - device = torch.device(device_name) - if model_name == 'layoutreader': - # 检测modelscope的缓存目录是否存在 - layoutreader_model_dir = get_local_layoutreader_model_dir() - if os.path.exists(layoutreader_model_dir): - model = LayoutLMv3ForTokenClassification.from_pretrained( - layoutreader_model_dir - ) - else: - logger.warning( - 'local layoutreader model not exists, use online model from huggingface' - ) - model = LayoutLMv3ForTokenClassification.from_pretrained( - 'hantian/layoutreader' - ) - if bf_16_support: - model.to(device).eval().bfloat16() - else: - model.to(device).eval() - else: - logger.error('model name not allow') - exit(1) - return model - - -class ModelSingleton: - _instance = None - _models = {} - - def __new__(cls, *args, **kwargs): - if cls._instance is None: - cls._instance = super().__new__(cls) - return cls._instance 
- - def get_model(self, model_name: str): - if model_name not in self._models: - self._models[model_name] = model_init(model_name=model_name) - return self._models[model_name] - - -def do_predict(boxes: List[List[int]], model) -> List[int]: - from magic_pdf.model.sub_modules.reading_oreder.layoutreader.helpers import ( - boxes2inputs, parse_logits, prepare_inputs) - - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=FutureWarning, module="transformers") - - inputs = boxes2inputs(boxes) - inputs = prepare_inputs(inputs, model) - logits = model(**inputs).logits.cpu().squeeze(0) - return parse_logits(logits, len(boxes)) - - -def cal_block_index(fix_blocks, sorted_bboxes): - - if sorted_bboxes is not None: - # 使用layoutreader排序 - for block in fix_blocks: - line_index_list = [] - if len(block['lines']) == 0: - block['index'] = sorted_bboxes.index(block['bbox']) - else: - for line in block['lines']: - line['index'] = sorted_bboxes.index(line['bbox']) - line_index_list.append(line['index']) - median_value = statistics.median(line_index_list) - block['index'] = median_value - - # 删除图表body block中的虚拟line信息, 并用real_lines信息回填 - if block['type'] in [BlockType.ImageBody, BlockType.TableBody, BlockType.Title, BlockType.InterlineEquation]: - if 'real_lines' in block: - block['virtual_lines'] = copy.deepcopy(block['lines']) - block['lines'] = copy.deepcopy(block['real_lines']) - del block['real_lines'] - else: - # 使用xycut排序 - block_bboxes = [] - for block in fix_blocks: - # 如果block['bbox']任意值小于0,将其置为0 - block['bbox'] = [max(0, x) for x in block['bbox']] - block_bboxes.append(block['bbox']) - - # 删除图表body block中的虚拟line信息, 并用real_lines信息回填 - if block['type'] in [BlockType.ImageBody, BlockType.TableBody, BlockType.Title, BlockType.InterlineEquation]: - if 'real_lines' in block: - block['virtual_lines'] = copy.deepcopy(block['lines']) - block['lines'] = copy.deepcopy(block['real_lines']) - del block['real_lines'] - - import numpy as np - - from magic_pdf.model.sub_modules.reading_oreder.layoutreader.xycut import \ - recursive_xy_cut - - random_boxes = np.array(block_bboxes) - np.random.shuffle(random_boxes) - res = [] - recursive_xy_cut(np.asarray(random_boxes).astype(int), np.arange(len(block_bboxes)), res) - assert len(res) == len(block_bboxes) - sorted_boxes = random_boxes[np.array(res)].tolist() - - for i, block in enumerate(fix_blocks): - block['index'] = sorted_boxes.index(block['bbox']) - - # 生成line index - sorted_blocks = sorted(fix_blocks, key=lambda b: b['index']) - line_inedx = 1 - for block in sorted_blocks: - for line in block['lines']: - line['index'] = line_inedx - line_inedx += 1 - - return fix_blocks - - -def insert_lines_into_block(block_bbox, line_height, page_w, page_h): - # block_bbox是一个元组(x0, y0, x1, y1),其中(x0, y0)是左下角坐标,(x1, y1)是右上角坐标 - x0, y0, x1, y1 = block_bbox - - block_height = y1 - y0 - block_weight = x1 - x0 - - # 如果block高度小于n行正文,则直接返回block的bbox - if line_height * 2 < block_height: - if ( - block_height > page_h * 0.25 and page_w * 0.5 > block_weight > page_w * 0.25 - ): # 可能是双列结构,可以切细点 - lines = int(block_height / line_height) - else: - # 如果block的宽度超过0.4页面宽度,则将block分成3行(是一种复杂布局,图不能切的太细) - if block_weight > page_w * 0.4: - lines = 3 - elif block_weight > page_w * 0.25: # (可能是三列结构,也切细点) - lines = int(block_height / line_height) - else: # 判断长宽比 - if block_height / block_weight > 1.2: # 细长的不分 - return [[x0, y0, x1, y1]] - else: # 不细长的还是分成两行 - lines = 2 - - line_height = (y1 - y0) / lines - - # 确定从哪个y位置开始绘制线条 - current_y = y0 - - # 用于存储线条的位置信息[(x0, y), ...] 
- lines_positions = [] - - for i in range(lines): - lines_positions.append([x0, current_y, x1, current_y + line_height]) - current_y += line_height - return lines_positions - - else: - return [[x0, y0, x1, y1]] - - -def sort_lines_by_model(fix_blocks, page_w, page_h, line_height, footnote_blocks): - page_line_list = [] - - def add_lines_to_block(b): - line_bboxes = insert_lines_into_block(b['bbox'], line_height, page_w, page_h) - b['lines'] = [] - for line_bbox in line_bboxes: - b['lines'].append({'bbox': line_bbox, 'spans': []}) - page_line_list.extend(line_bboxes) - - for block in fix_blocks: - if block['type'] in [ - BlockType.Text, BlockType.Title, - BlockType.ImageCaption, BlockType.ImageFootnote, - BlockType.TableCaption, BlockType.TableFootnote - ]: - if len(block['lines']) == 0: - add_lines_to_block(block) - elif block['type'] in [BlockType.Title] and len(block['lines']) == 1 and (block['bbox'][3] - block['bbox'][1]) > line_height * 2: - block['real_lines'] = copy.deepcopy(block['lines']) - add_lines_to_block(block) - else: - for line in block['lines']: - bbox = line['bbox'] - page_line_list.append(bbox) - elif block['type'] in [BlockType.ImageBody, BlockType.TableBody, BlockType.InterlineEquation]: - block['real_lines'] = copy.deepcopy(block['lines']) - add_lines_to_block(block) - - for block in footnote_blocks: - footnote_block = {'bbox': block[:4]} - add_lines_to_block(footnote_block) - - if len(page_line_list) > 200: # layoutreader最高支持512line - return None - - # 使用layoutreader排序 - x_scale = 1000.0 / page_w - y_scale = 1000.0 / page_h - boxes = [] - # logger.info(f"Scale: {x_scale}, {y_scale}, Boxes len: {len(page_line_list)}") - for left, top, right, bottom in page_line_list: - if left < 0: - logger.warning( - f'left < 0, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}' - ) # noqa: E501 - left = 0 - if right > page_w: - logger.warning( - f'right > page_w, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}' - ) # noqa: E501 - right = page_w - if top < 0: - logger.warning( - f'top < 0, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}' - ) # noqa: E501 - top = 0 - if bottom > page_h: - logger.warning( - f'bottom > page_h, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}' - ) # noqa: E501 - bottom = page_h - - left = round(left * x_scale) - top = round(top * y_scale) - right = round(right * x_scale) - bottom = round(bottom * y_scale) - assert ( - 1000 >= right >= left >= 0 and 1000 >= bottom >= top >= 0 - ), f'Invalid box. 
right: {right}, left: {left}, bottom: {bottom}, top: {top}' # noqa: E126, E121 - boxes.append([left, top, right, bottom]) - model_manager = ModelSingleton() - model = model_manager.get_model('layoutreader') - with torch.no_grad(): - orders = do_predict(boxes, model) - sorted_bboxes = [page_line_list[i] for i in orders] - - return sorted_bboxes - - -def get_line_height(blocks): - page_line_height_list = [] - for block in blocks: - if block['type'] in [ - BlockType.Text, BlockType.Title, - BlockType.ImageCaption, BlockType.ImageFootnote, - BlockType.TableCaption, BlockType.TableFootnote - ]: - for line in block['lines']: - bbox = line['bbox'] - page_line_height_list.append(int(bbox[3] - bbox[1])) - if len(page_line_height_list) > 0: - return statistics.median(page_line_height_list) - else: - return 10 - - -def process_groups(groups, body_key, caption_key, footnote_key): - body_blocks = [] - caption_blocks = [] - footnote_blocks = [] - for i, group in enumerate(groups): - group[body_key]['group_id'] = i - body_blocks.append(group[body_key]) - for caption_block in group[caption_key]: - caption_block['group_id'] = i - caption_blocks.append(caption_block) - for footnote_block in group[footnote_key]: - footnote_block['group_id'] = i - footnote_blocks.append(footnote_block) - return body_blocks, caption_blocks, footnote_blocks - - -def process_block_list(blocks, body_type, block_type): - indices = [block['index'] for block in blocks] - median_index = statistics.median(indices) - - body_bbox = next((block['bbox'] for block in blocks if block.get('type') == body_type), []) - - return { - 'type': block_type, - 'bbox': body_bbox, - 'blocks': blocks, - 'index': median_index, - } - - -def revert_group_blocks(blocks): - image_groups = {} - table_groups = {} - new_blocks = [] - for block in blocks: - if block['type'] in [BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote]: - group_id = block['group_id'] - if group_id not in image_groups: - image_groups[group_id] = [] - image_groups[group_id].append(block) - elif block['type'] in [BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote]: - group_id = block['group_id'] - if group_id not in table_groups: - table_groups[group_id] = [] - table_groups[group_id].append(block) - else: - new_blocks.append(block) - - for group_id, blocks in image_groups.items(): - new_blocks.append(process_block_list(blocks, BlockType.ImageBody, BlockType.Image)) - - for group_id, blocks in table_groups.items(): - new_blocks.append(process_block_list(blocks, BlockType.TableBody, BlockType.Table)) - - return new_blocks - - -def remove_outside_spans(spans, all_bboxes, all_discarded_blocks): - def get_block_bboxes(blocks, block_type_list): - return [block[0:4] for block in blocks if block[7] in block_type_list] - - image_bboxes = get_block_bboxes(all_bboxes, [BlockType.ImageBody]) - table_bboxes = get_block_bboxes(all_bboxes, [BlockType.TableBody]) - other_block_type = [] - for block_type in BlockType.__dict__.values(): - if not isinstance(block_type, str): - continue - if block_type not in [BlockType.ImageBody, BlockType.TableBody]: - other_block_type.append(block_type) - other_block_bboxes = get_block_bboxes(all_bboxes, other_block_type) - discarded_block_bboxes = get_block_bboxes(all_discarded_blocks, [BlockType.Discarded]) - - new_spans = [] - - for span in spans: - span_bbox = span['bbox'] - span_type = span['type'] - - if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.4 for block_bbox in - discarded_block_bboxes): - 
new_spans.append(span) - continue - - if span_type == ContentType.Image: - if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in - image_bboxes): - new_spans.append(span) - elif span_type == ContentType.Table: - if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in - table_bboxes): - new_spans.append(span) - else: - if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in - other_block_bboxes): - new_spans.append(span) - - return new_spans - - -def parse_page_core( - page_doc: PageableData, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode, lang -): - need_drop = False - drop_reason = [] - - """从magic_model对象中获取后面会用到的区块信息""" - img_groups = magic_model.get_imgs_v2(page_id) - table_groups = magic_model.get_tables_v2(page_id) - - """对image和table的区块分组""" - img_body_blocks, img_caption_blocks, img_footnote_blocks = process_groups( - img_groups, 'image_body', 'image_caption_list', 'image_footnote_list' - ) - - table_body_blocks, table_caption_blocks, table_footnote_blocks = process_groups( - table_groups, 'table_body', 'table_caption_list', 'table_footnote_list' - ) - - discarded_blocks = magic_model.get_discarded(page_id) - text_blocks = magic_model.get_text_blocks(page_id) - title_blocks = magic_model.get_title_blocks(page_id) - inline_equations, interline_equations, interline_equation_blocks = magic_model.get_equations(page_id) - page_w, page_h = magic_model.get_page_size(page_id) - - def merge_title_blocks(blocks, x_distance_threshold=0.1*page_w): - def merge_two_bbox(b1, b2): - x_min = min(b1['bbox'][0], b2['bbox'][0]) - y_min = min(b1['bbox'][1], b2['bbox'][1]) - x_max = max(b1['bbox'][2], b2['bbox'][2]) - y_max = max(b1['bbox'][3], b2['bbox'][3]) - return x_min, y_min, x_max, y_max - - def merge_two_blocks(b1, b2): - # 合并两个标题块的边界框 - b1['bbox'] = merge_two_bbox(b1, b2) - - # 合并两个标题块的文本内容 - line1 = b1['lines'][0] - line2 = b2['lines'][0] - line1['bbox'] = merge_two_bbox(line1, line2) - line1['spans'].extend(line2['spans']) - - return b1, b2 - - # 按 y 轴重叠度聚集标题块 - y_overlapping_blocks = [] - title_bs = [b for b in blocks if b['type'] == BlockType.Title] - while title_bs: - block1 = title_bs.pop(0) - current_row = [block1] - to_remove = [] - for block2 in title_bs: - if ( - __is_overlaps_y_exceeds_threshold(block1['bbox'], block2['bbox'], 0.9) - and len(block1['lines']) == 1 - and len(block2['lines']) == 1 - ): - current_row.append(block2) - to_remove.append(block2) - for b in to_remove: - title_bs.remove(b) - y_overlapping_blocks.append(current_row) - - # 按x轴坐标排序并合并标题块 - to_remove_blocks = [] - for row in y_overlapping_blocks: - if len(row) == 1: - continue - - # 按x轴坐标排序 - row.sort(key=lambda x: x['bbox'][0]) - - merged_block = row[0] - for i in range(1, len(row)): - left_block = merged_block - right_block = row[i] - - left_height = left_block['bbox'][3] - left_block['bbox'][1] - right_height = right_block['bbox'][3] - right_block['bbox'][1] - - if ( - right_block['bbox'][0] - left_block['bbox'][2] < x_distance_threshold - and left_height * 0.95 < right_height < left_height * 1.05 - ): - merged_block, to_remove_block = merge_two_blocks(merged_block, right_block) - to_remove_blocks.append(to_remove_block) - else: - merged_block = right_block - - for b in to_remove_blocks: - blocks.remove(b) - - """将所有区块的bbox整理到一起""" - # interline_equation_blocks参数不够准,后面切换到interline_equations上 - interline_equation_blocks = [] - if len(interline_equation_blocks) > 0: - 
all_bboxes, all_discarded_blocks, footnote_blocks = ocr_prepare_bboxes_for_layout_split_v2( - img_body_blocks, img_caption_blocks, img_footnote_blocks, - table_body_blocks, table_caption_blocks, table_footnote_blocks, - discarded_blocks, - text_blocks, - title_blocks, - interline_equation_blocks, - page_w, - page_h, - ) - else: - all_bboxes, all_discarded_blocks, footnote_blocks = ocr_prepare_bboxes_for_layout_split_v2( - img_body_blocks, img_caption_blocks, img_footnote_blocks, - table_body_blocks, table_caption_blocks, table_footnote_blocks, - discarded_blocks, - text_blocks, - title_blocks, - interline_equations, - page_w, - page_h, - ) - - """获取所有的spans信息""" - spans = magic_model.get_all_spans(page_id) - - """在删除重复span之前,应该通过image_body和table_body的block过滤一下image和table的span""" - """顺便删除大水印并保留abandon的span""" - spans = remove_outside_spans(spans, all_bboxes, all_discarded_blocks) - - """删除重叠spans中置信度较低的那些""" - spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans) - """删除重叠spans中较小的那些""" - spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans) - - """根据parse_mode,构造spans,主要是文本类的字符填充""" - if parse_mode == SupportedPdfParseMethod.TXT: - - """使用新版本的混合ocr方案.""" - spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang) - - elif parse_mode == SupportedPdfParseMethod.OCR: - pass - else: - raise Exception('parse_mode must be txt or ocr') - - """先处理不需要排版的discarded_blocks""" - discarded_block_with_spans, spans = fill_spans_in_blocks( - all_discarded_blocks, spans, 0.4 - ) - fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans) - - """如果当前页面没有有效的bbox则跳过""" - if len(all_bboxes) == 0: - logger.warning(f'skip this page, not found useful bbox, page_id: {page_id}') - return ocr_construct_page_component_v2( - [], - [], - page_id, - page_w, - page_h, - [], - [], - [], - interline_equations, - fix_discarded_blocks, - need_drop, - drop_reason, - ) - - """对image和table截图""" - spans = ocr_cut_image_and_table( - spans, page_doc, page_id, pdf_bytes_md5, imageWriter - ) - - """span填充进block""" - block_with_spans, spans = fill_spans_in_blocks(all_bboxes, spans, 0.5) - - """对block进行fix操作""" - fix_blocks = fix_block_spans_v2(block_with_spans) - - """同一行被断开的titile合并""" - merge_title_blocks(fix_blocks) - - """获取所有line并计算正文line的高度""" - line_height = get_line_height(fix_blocks) - - """获取所有line并对line排序""" - sorted_bboxes = sort_lines_by_model(fix_blocks, page_w, page_h, line_height, footnote_blocks) - - """根据line的中位数算block的序列关系""" - fix_blocks = cal_block_index(fix_blocks, sorted_bboxes) - - """将image和table的block还原回group形式参与后续流程""" - fix_blocks = revert_group_blocks(fix_blocks) - - """重排block""" - sorted_blocks = sorted(fix_blocks, key=lambda b: b['index']) - - """block内重排(img和table的block内多个caption或footnote的排序)""" - for block in sorted_blocks: - if block['type'] in [BlockType.Image, BlockType.Table]: - block['blocks'] = sorted(block['blocks'], key=lambda b: b['index']) - - """获取QA需要外置的list""" - images, tables, interline_equations = get_qa_need_list_v2(sorted_blocks) - - """构造pdf_info_dict""" - page_info = ocr_construct_page_component_v2( - sorted_blocks, - [], - page_id, - page_w, - page_h, - [], - images, - tables, - interline_equations, - fix_discarded_blocks, - need_drop, - drop_reason, - ) - return page_info - - -def pdf_parse_union( - model_list, - dataset: Dataset, - imageWriter, - parse_mode, - start_page_id=0, - end_page_id=None, - debug_mode=False, - lang=None, -): - - pdf_bytes_md5 = compute_md5(dataset.data_bits()) - - 
"""初始化空的pdf_info_dict""" - pdf_info_dict = {} - - """用model_list和docs对象初始化magic_model""" - magic_model = MagicModel(model_list, dataset) - - """根据输入的起始范围解析pdf""" - end_page_id = ( - end_page_id - if end_page_id is not None and end_page_id >= 0 - else len(dataset) - 1 - ) - - if end_page_id > len(dataset) - 1: - logger.warning('end_page_id is out of range, use pdf_docs length') - end_page_id = len(dataset) - 1 - - # """初始化启动时间""" - # start_time = time.time() - - # for page_id, page in enumerate(dataset): - for page_id, page in tqdm(enumerate(dataset), total=len(dataset), desc="Processing pages"): - # """debug时输出每页解析的耗时.""" - # if debug_mode: - # time_now = time.time() - # logger.info( - # f'page_id: {page_id}, last_page_cost_time: {round(time.time() - start_time, 2)}' - # ) - # start_time = time_now - - """解析pdf中的每一页""" - if start_page_id <= page_id <= end_page_id: - page_info = parse_page_core( - page, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode, lang - ) - else: - page_info = page.get_page_info() - page_w = page_info.w - page_h = page_info.h - page_info = ocr_construct_page_component_v2( - [], [], page_id, page_w, page_h, [], [], [], [], [], True, 'skip page' - ) - pdf_info_dict[f'page_{page_id}'] = page_info - - need_ocr_list = [] - img_crop_list = [] - text_block_list = [] - for pange_id, page_info in pdf_info_dict.items(): - for block in page_info['preproc_blocks']: - if block['type'] in ['table', 'image']: - for sub_block in block['blocks']: - if sub_block['type'] in ['image_caption', 'image_footnote', 'table_caption', 'table_footnote']: - text_block_list.append(sub_block) - elif block['type'] in ['text', 'title']: - text_block_list.append(block) - for block in page_info['discarded_blocks']: - text_block_list.append(block) - for block in text_block_list: - for line in block['lines']: - for span in line['spans']: - if 'np_img' in span: - need_ocr_list.append(span) - img_crop_list.append(span['np_img']) - span.pop('np_img') - if len(img_crop_list) > 0: - # Get OCR results for this language's images - atom_model_manager = AtomModelSingleton() - ocr_model = atom_model_manager.get_atom_model( - atom_model_name='ocr', - ocr_show_log=False, - det_db_box_thresh=0.3, - lang=lang - ) - # rec_start = time.time() - ocr_res_list = ocr_model.ocr(img_crop_list, det=False, tqdm_enable=True)[0] - # Verify we have matching counts - assert len(ocr_res_list) == len(need_ocr_list), f'ocr_res_list: {len(ocr_res_list)}, need_ocr_list: {len(need_ocr_list)}' - # Process OCR results for this language - for index, span in enumerate(need_ocr_list): - ocr_text, ocr_score = ocr_res_list[index] - span['content'] = ocr_text - span['score'] = float(f"{ocr_score:.3f}") - # rec_time = time.time() - rec_start - # logger.info(f'ocr-dynamic-rec time: {round(rec_time, 2)}, total images processed: {len(img_crop_list)}') - - - """分段""" - para_split(pdf_info_dict) - - """llm优化""" - llm_aided_config = get_llm_aided_config() - if llm_aided_config is not None: - """公式优化""" - formula_aided_config = llm_aided_config.get('formula_aided', None) - if formula_aided_config is not None: - if formula_aided_config.get('enable', False): - llm_aided_formula_start_time = time.time() - llm_aided_formula(pdf_info_dict, formula_aided_config) - logger.info(f'llm aided formula time: {round(time.time() - llm_aided_formula_start_time, 2)}') - """文本优化""" - text_aided_config = llm_aided_config.get('text_aided', None) - if text_aided_config is not None: - if text_aided_config.get('enable', False): - llm_aided_text_start_time = 
time.time() - llm_aided_text(pdf_info_dict, text_aided_config) - logger.info(f'llm aided text time: {round(time.time() - llm_aided_text_start_time, 2)}') - """标题优化""" - title_aided_config = llm_aided_config.get('title_aided', None) - if title_aided_config is not None: - if title_aided_config.get('enable', False): - llm_aided_title_start_time = time.time() - llm_aided_title(pdf_info_dict, title_aided_config) - logger.info(f'llm aided title time: {round(time.time() - llm_aided_title_start_time, 2)}') - - """dict转list""" - pdf_info_list = dict_to_list(pdf_info_dict) - new_pdf_info_dict = { - 'pdf_info': pdf_info_list, - } - - clean_memory(get_device()) - - return new_pdf_info_dict - - -if __name__ == '__main__': - pass diff --git a/magic_pdf/post_proc/llm_aided.py b/magic_pdf/post_proc/llm_aided.py deleted file mode 100644 index c37481b3298a654b4596383e565f4514734d2dec..0000000000000000000000000000000000000000 --- a/magic_pdf/post_proc/llm_aided.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright (c) Opendatalab. All rights reserved. -import json -from loguru import logger -from magic_pdf.dict2md.ocr_mkcontent import merge_para_with_text -from openai import OpenAI -import ast - - -#@todo: 有的公式以"\"结尾,这样会导致尾部拼接的"$"被转义,也需要修复 -formula_optimize_prompt = """请根据以下指南修正LaTeX公式的错误,确保公式能够渲染且符合原始内容: - -1. 修正渲染或编译错误: - - Some syntax errors such as mismatched/missing/extra tokens. Your task is to fix these syntax errors and make sure corrected results conform to latex math syntax principles. - - 包含KaTeX不支持的关键词等原因导致的无法编译或渲染的错误 - -2. 保留原始信息: - - 保留原始公式中的所有重要信息 - - 不要添加任何原始公式中没有的新信息 - -IMPORTANT:请仅返回修正后的公式,不要包含任何介绍、解释或元数据。 - -LaTeX recognition result: -$FORMULA - -Your corrected result: -""" - -text_optimize_prompt = f"""请根据以下指南修正OCR引起的错误,确保文本连贯并符合原始内容: - -1. 修正OCR引起的拼写错误和错误: - - 修正常见的OCR错误(例如,'rn' 被误读为 'm') - - 使用上下文和常识进行修正 - - 只修正明显的错误,不要不必要的修改内容 - - 不要添加额外的句号或其他不必要的标点符号 - -2. 保持原始结构: - - 保留所有标题和子标题 - -3. 保留原始内容: - - 保留原始文本中的所有重要信息 - - 不要添加任何原始文本中没有的新信息 - - 保留段落之间的换行符 - -4. 保持连贯性: - - 确保内容与前文顺畅连接 - - 适当处理在句子中间开始或结束的文本 - -5. 修正行内公式: - - 去除行内公式前后多余的空格 - - 修正公式中的OCR错误 - - 确保公式能够通过KaTeX渲染 - -6. 修正全角字符 - - 修正全角标点符号为半角标点符号 - - 修正全角字母为半角字母 - - 修正全角数字为半角数字 - -IMPORTANT:请仅返回修正后的文本,保留所有原始格式,包括换行符。不要包含任何介绍、解释或元数据。 - -Previous context: - -Current chunk to process: - -Corrected text: -""" - -def llm_aided_formula(pdf_info_dict, formula_aided_config): - pass - -def llm_aided_text(pdf_info_dict, text_aided_config): - pass - -def llm_aided_title(pdf_info_dict, title_aided_config): - client = OpenAI( - api_key=title_aided_config["api_key"], - base_url=title_aided_config["base_url"], - ) - title_dict = {} - origin_title_list = [] - i = 0 - for page_num, page in pdf_info_dict.items(): - blocks = page["para_blocks"] - for block in blocks: - if block["type"] == "title": - origin_title_list.append(block) - title_text = merge_para_with_text(block) - page_line_height_list = [] - for line in block['lines']: - bbox = line['bbox'] - page_line_height_list.append(int(bbox[3] - bbox[1])) - if len(page_line_height_list) > 0: - line_avg_height = sum(page_line_height_list) / len(page_line_height_list) - else: - line_avg_height = int(block['bbox'][3] - block['bbox'][1]) - title_dict[f"{i}"] = [title_text, line_avg_height, int(page_num[5:])+1] - i += 1 - # logger.info(f"Title list: {title_dict}") - - title_optimize_prompt = f"""输入的内容是一篇文档中所有标题组成的字典,请根据以下指南优化标题的结果,使结果符合正常文档的层次结构: - -1. 字典中每个value均为一个list,包含以下元素: - - 标题文本 - - 文本行高是标题所在块的平均行高 - - 标题所在的页码 - -2. 保留原始内容: - - 输入的字典中所有元素都是有效的,不能删除字典中的任何元素 - - 请务必保证输出的字典中元素的数量和输入的数量一致 - -3. 
保持字典内key-value的对应关系不变 - -4. 优化层次结构: - - 为每个标题元素添加适当的层次结构 - - 行高较大的标题一般是更高级别的标题 - - 标题从前至后的层级必须是连续的,不能跳过层级 - - 标题层级最多为4级,不要添加过多的层级 - - 优化后的标题只保留代表该标题的层级的整数,不要保留其他信息 - -5. 合理性检查与微调: - - 在完成初步分级后,仔细检查分级结果的合理性 - - 根据上下文关系和逻辑顺序,对不合理的分级进行微调 - - 确保最终的分级结果符合文档的实际结构和逻辑 - - 字典中可能包含被误当成标题的正文,你可以通过将其层级标记为 0 来排除它们 - -IMPORTANT: -请直接返回优化过的由标题层级组成的字典,格式为{{标题id:标题层级}},如下: -{{0:1,1:2,2:2,3:3}} -不需要对字典格式化,不需要返回任何其他信息。 - -Input title list: -{title_dict} - -Corrected title list: -""" - - retry_count = 0 - max_retries = 3 - dict_completion = None - - while retry_count < max_retries: - try: - completion = client.chat.completions.create( - model=title_aided_config["model"], - messages=[ - {'role': 'user', 'content': title_optimize_prompt}], - temperature=0.7, - ) - # logger.info(f"Title completion: {completion.choices[0].message.content}") - dict_completion = ast.literal_eval(completion.choices[0].message.content) - # logger.info(f"len(dict_completion): {len(dict_completion)}, len(title_dict): {len(title_dict)}") - - if len(dict_completion) == len(title_dict): - for i, origin_title_block in enumerate(origin_title_list): - origin_title_block["level"] = int(dict_completion[i]) - break - else: - logger.warning("The number of titles in the optimized result is not equal to the number of titles in the input.") - retry_count += 1 - except Exception as e: - logger.exception(e) - retry_count += 1 - - if dict_completion is None: - logger.error("Failed to decode dict after maximum retries.") diff --git a/magic_pdf/post_proc/para_split_v3.py b/magic_pdf/post_proc/para_split_v3.py deleted file mode 100644 index 5f6852a69c40bbf3e6bd2d42c2c0218ce88ab280..0000000000000000000000000000000000000000 --- a/magic_pdf/post_proc/para_split_v3.py +++ /dev/null @@ -1,394 +0,0 @@ -import copy - -from loguru import logger - -from magic_pdf.config.constants import CROSS_PAGE, LINES_DELETED -from magic_pdf.config.ocr_content_type import BlockType, ContentType -from magic_pdf.libs.language import detect_lang - -LINE_STOP_FLAG = ( - '.', - '!', - '?', - '。', - '!', - '?', - ')', - ')', - '"', - '”', - ':', - ':', - ';', - ';', -) -LIST_END_FLAG = ('.', '。', ';', ';') - - -class ListLineTag: - IS_LIST_START_LINE = 'is_list_start_line' - IS_LIST_END_LINE = 'is_list_end_line' - - -def __process_blocks(blocks): - # 对所有block预处理 - # 1.通过title和interline_equation将block分组 - # 2.bbox边界根据line信息重置 - - result = [] - current_group = [] - - for i in range(len(blocks)): - current_block = blocks[i] - - # 如果当前块是 text 类型 - if current_block['type'] == 'text': - current_block['bbox_fs'] = copy.deepcopy(current_block['bbox']) - if 'lines' in current_block and len(current_block['lines']) > 0: - current_block['bbox_fs'] = [ - min([line['bbox'][0] for line in current_block['lines']]), - min([line['bbox'][1] for line in current_block['lines']]), - max([line['bbox'][2] for line in current_block['lines']]), - max([line['bbox'][3] for line in current_block['lines']]), - ] - current_group.append(current_block) - - # 检查下一个块是否存在 - if i + 1 < len(blocks): - next_block = blocks[i + 1] - # 如果下一个块不是 text 类型且是 title 或 interline_equation 类型 - if next_block['type'] in ['title', 'interline_equation']: - result.append(current_group) - current_group = [] - - # 处理最后一个 group - if current_group: - result.append(current_group) - - return result - - -def __is_list_or_index_block(block): - # 一个block如果是list block 应该同时满足以下特征 - # 1.block内有多个line 2.block 内有多个line左侧顶格写 3.block内有多个line 右侧不顶格(狗牙状) - # 1.block内有多个line 2.block 内有多个line左侧顶格写 3.多个line以endflag结尾 - # 1.block内有多个line 2.block 
内有多个line左侧顶格写 3.block内有多个line 左侧不顶格 - - # index block 是一种特殊的list block - # 一个block如果是index block 应该同时满足以下特征 - # 1.block内有多个line 2.block 内有多个line两侧均顶格写 3.line的开头或者结尾均为数字 - if len(block['lines']) >= 2: - first_line = block['lines'][0] - line_height = first_line['bbox'][3] - first_line['bbox'][1] - block_weight = block['bbox_fs'][2] - block['bbox_fs'][0] - block_height = block['bbox_fs'][3] - block['bbox_fs'][1] - page_weight, page_height = block['page_size'] - - left_close_num = 0 - left_not_close_num = 0 - right_not_close_num = 0 - right_close_num = 0 - lines_text_list = [] - center_close_num = 0 - external_sides_not_close_num = 0 - multiple_para_flag = False - last_line = block['lines'][-1] - - if page_weight == 0: - block_weight_radio = 0 - else: - block_weight_radio = block_weight / page_weight - # logger.info(f"block_weight_radio: {block_weight_radio}") - - # 如果首行左边不顶格而右边顶格,末行左边顶格而右边不顶格 (第一行可能可以右边不顶格) - if ( - first_line['bbox'][0] - block['bbox_fs'][0] > line_height / 2 - and abs(last_line['bbox'][0] - block['bbox_fs'][0]) < line_height / 2 - and block['bbox_fs'][2] - last_line['bbox'][2] > line_height - ): - multiple_para_flag = True - - block_text = '' - - for line in block['lines']: - line_text = '' - - for span in line['spans']: - span_type = span['type'] - if span_type == ContentType.Text: - line_text += span['content'].strip() - # 添加所有文本,包括空行,保持与block['lines']长度一致 - lines_text_list.append(line_text) - block_text = ''.join(lines_text_list) - - block_lang = detect_lang(block_text) - # logger.info(f"block_lang: {block_lang}") - - for line in block['lines']: - line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2 - block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2 - if ( - line['bbox'][0] - block['bbox_fs'][0] > 0.7 * line_height - and block['bbox_fs'][2] - line['bbox'][2] > 0.7 * line_height - ): - external_sides_not_close_num += 1 - if abs(line_mid_x - block_mid_x) < line_height / 2: - center_close_num += 1 - - # 计算line左侧顶格数量是否大于2,是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断 - if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2: - left_close_num += 1 - elif line['bbox'][0] - block['bbox_fs'][0] > line_height: - left_not_close_num += 1 - - # 计算右侧是否顶格 - if abs(block['bbox_fs'][2] - line['bbox'][2]) < line_height: - right_close_num += 1 - else: - # 类中文没有超长单词的情况,可以用统一的阈值 - if block_lang in ['zh', 'ja', 'ko']: - closed_area = 0.26 * block_weight - else: - # 右侧不顶格情况下是否有一段距离,拍脑袋用0.3block宽度做阈值 - # block宽的阈值可以小些,block窄的阈值要大 - if block_weight_radio >= 0.5: - closed_area = 0.26 * block_weight - else: - closed_area = 0.36 * block_weight - if block['bbox_fs'][2] - line['bbox'][2] > closed_area: - right_not_close_num += 1 - - # 判断lines_text_list中的元素是否有超过80%都以LIST_END_FLAG结尾 - line_end_flag = False - # 判断lines_text_list中的元素是否有超过80%都以数字开头或都以数字结尾 - line_num_flag = False - num_start_count = 0 - num_end_count = 0 - flag_end_count = 0 - - if len(lines_text_list) > 0: - for line_text in lines_text_list: - if len(line_text) > 0: - if line_text[-1] in LIST_END_FLAG: - flag_end_count += 1 - if line_text[0].isdigit(): - num_start_count += 1 - if line_text[-1].isdigit(): - num_end_count += 1 - - if ( - num_start_count / len(lines_text_list) >= 0.8 - or num_end_count / len(lines_text_list) >= 0.8 - ): - line_num_flag = True - if flag_end_count / len(lines_text_list) >= 0.8: - line_end_flag = True - - # 有的目录右侧不贴边, 目前认为左边或者右边有一边全贴边,且符合数字规则极为index - if ( - left_close_num / len(block['lines']) >= 0.8 - or right_close_num / len(block['lines']) >= 0.8 - ) and 
line_num_flag: - for line in block['lines']: - line[ListLineTag.IS_LIST_START_LINE] = True - return BlockType.Index - - # 全部line都居中的特殊list识别,每行都需要换行,特征是多行,且大多数行都前后not_close,每line中点x坐标接近 - # 补充条件block的长宽比有要求 - elif ( - external_sides_not_close_num >= 2 - and center_close_num == len(block['lines']) - and external_sides_not_close_num / len(block['lines']) >= 0.5 - and block_height / block_weight > 0.4 - ): - for line in block['lines']: - line[ListLineTag.IS_LIST_START_LINE] = True - return BlockType.List - - elif ( - left_close_num >= 2 - and (right_not_close_num >= 2 or line_end_flag or left_not_close_num >= 2) - and not multiple_para_flag - # and block_weight_radio > 0.27 - ): - # 处理一种特殊的没有缩进的list,所有行都贴左边,通过右边的空隙判断是否是item尾 - if left_close_num / len(block['lines']) > 0.8: - # 这种是每个item只有一行,且左边都贴边的短item list - if flag_end_count == 0 and right_close_num / len(block['lines']) < 0.5: - for line in block['lines']: - if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2: - line[ListLineTag.IS_LIST_START_LINE] = True - # 这种是大部分line item 都有结束标识符的情况,按结束标识符区分不同item - elif line_end_flag: - for i, line in enumerate(block['lines']): - if ( - len(lines_text_list[i]) > 0 - and lines_text_list[i][-1] in LIST_END_FLAG - ): - line[ListLineTag.IS_LIST_END_LINE] = True - if i + 1 < len(block['lines']): - block['lines'][i + 1][ - ListLineTag.IS_LIST_START_LINE - ] = True - # line item基本没有结束标识符,而且也没有缩进,按右侧空隙判断哪些是item end - else: - line_start_flag = False - for i, line in enumerate(block['lines']): - if line_start_flag: - line[ListLineTag.IS_LIST_START_LINE] = True - line_start_flag = False - - if ( - abs(block['bbox_fs'][2] - line['bbox'][2]) - > 0.1 * block_weight - ): - line[ListLineTag.IS_LIST_END_LINE] = True - line_start_flag = True - # 一种有缩进的特殊有序list,start line 左侧不贴边且以数字开头,end line 以 IS_LIST_END_FLAG 结尾且数量和start line 一致 - elif num_start_count >= 2 and num_start_count == flag_end_count: - for i, line in enumerate(block['lines']): - if len(lines_text_list[i]) > 0: - if lines_text_list[i][0].isdigit(): - line[ListLineTag.IS_LIST_START_LINE] = True - if lines_text_list[i][-1] in LIST_END_FLAG: - line[ListLineTag.IS_LIST_END_LINE] = True - else: - # 正常有缩进的list处理 - for line in block['lines']: - if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2: - line[ListLineTag.IS_LIST_START_LINE] = True - if abs(block['bbox_fs'][2] - line['bbox'][2]) > line_height: - line[ListLineTag.IS_LIST_END_LINE] = True - - return BlockType.List - else: - return BlockType.Text - else: - return BlockType.Text - - -def __merge_2_text_blocks(block1, block2): - if len(block1['lines']) > 0: - first_line = block1['lines'][0] - line_height = first_line['bbox'][3] - first_line['bbox'][1] - block1_weight = block1['bbox'][2] - block1['bbox'][0] - block2_weight = block2['bbox'][2] - block2['bbox'][0] - min_block_weight = min(block1_weight, block2_weight) - if abs(block1['bbox_fs'][0] - first_line['bbox'][0]) < line_height / 2: - last_line = block2['lines'][-1] - if len(last_line['spans']) > 0: - last_span = last_line['spans'][-1] - line_height = last_line['bbox'][3] - last_line['bbox'][1] - if len(first_line['spans']) > 0: - first_span = first_line['spans'][0] - if len(first_span['content']) > 0: - span_start_with_num = first_span['content'][0].isdigit() - span_start_with_big_char = first_span['content'][0].isupper() - if ( - # 上一个block的最后一个line的右边界和block的右边界差距不超过line_height - abs(block2['bbox_fs'][2] - last_line['bbox'][2]) < line_height - # 上一个block的最后一个span不是以特定符号结尾 - and not last_span['content'].endswith(LINE_STOP_FLAG) - 
# 两个block宽度差距超过2倍也不合并 - and abs(block1_weight - block2_weight) < min_block_weight - # 下一个block的第一个字符是数字 - and not span_start_with_num - # 下一个block的第一个字符是大写字母 - and not span_start_with_big_char - ): - if block1['page_num'] != block2['page_num']: - for line in block1['lines']: - for span in line['spans']: - span[CROSS_PAGE] = True - block2['lines'].extend(block1['lines']) - block1['lines'] = [] - block1[LINES_DELETED] = True - - return block1, block2 - - -def __merge_2_list_blocks(block1, block2): - if block1['page_num'] != block2['page_num']: - for line in block1['lines']: - for span in line['spans']: - span[CROSS_PAGE] = True - block2['lines'].extend(block1['lines']) - block1['lines'] = [] - block1[LINES_DELETED] = True - - return block1, block2 - - -def __is_list_group(text_blocks_group): - # list group的特征是一个group内的所有block都满足以下条件 - # 1.每个block都不超过3行 2. 每个block 的左边界都比较接近(逻辑简单点先不加这个规则) - for block in text_blocks_group: - if len(block['lines']) > 3: - return False - return True - - -def __para_merge_page(blocks): - page_text_blocks_groups = __process_blocks(blocks) - for text_blocks_group in page_text_blocks_groups: - if len(text_blocks_group) > 0: - # 需要先在合并前对所有block判断是否为list or index block - for block in text_blocks_group: - block_type = __is_list_or_index_block(block) - block['type'] = block_type - # logger.info(f"{block['type']}:{block}") - - if len(text_blocks_group) > 1: - # 在合并前判断这个group 是否是一个 list group - is_list_group = __is_list_group(text_blocks_group) - - # 倒序遍历 - for i in range(len(text_blocks_group) - 1, -1, -1): - current_block = text_blocks_group[i] - - # 检查是否有前一个块 - if i - 1 >= 0: - prev_block = text_blocks_group[i - 1] - - if ( - current_block['type'] == 'text' - and prev_block['type'] == 'text' - and not is_list_group - ): - __merge_2_text_blocks(current_block, prev_block) - elif ( - current_block['type'] == BlockType.List - and prev_block['type'] == BlockType.List - ) or ( - current_block['type'] == BlockType.Index - and prev_block['type'] == BlockType.Index - ): - __merge_2_list_blocks(current_block, prev_block) - - else: - continue - - -def para_split(pdf_info_dict): - all_blocks = [] - for page_num, page in pdf_info_dict.items(): - blocks = copy.deepcopy(page['preproc_blocks']) - for block in blocks: - block['page_num'] = page_num - block['page_size'] = page['page_size'] - all_blocks.extend(blocks) - - __para_merge_page(all_blocks) - for page_num, page in pdf_info_dict.items(): - page['para_blocks'] = [] - for block in all_blocks: - if block['page_num'] == page_num: - page['para_blocks'].append(block) - - -if __name__ == '__main__': - input_blocks = [] - # 调用函数 - groups = __process_blocks(input_blocks) - for group_index, group in enumerate(groups): - print(f'Group {group_index}: {group}') diff --git a/magic_pdf/pre_proc/__init__.py b/magic_pdf/pre_proc/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/magic_pdf/pre_proc/construct_page_dict.py b/magic_pdf/pre_proc/construct_page_dict.py deleted file mode 100644 index 09c09c137bc75c869c4d6f58594bb713c6944ec8..0000000000000000000000000000000000000000 --- a/magic_pdf/pre_proc/construct_page_dict.py +++ /dev/null @@ -1,17 +0,0 @@ - -def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree, - images, tables, interline_equations, discarded_blocks, need_drop, drop_reason): - return_dict = { - 'preproc_blocks': blocks, - 'layout_bboxes': layout_bboxes, - 'page_idx': page_id, - 'page_size': [page_w, 
page_h], - '_layout_tree': layout_tree, - 'images': images, - 'tables': tables, - 'interline_equations': interline_equations, - 'discarded_blocks': discarded_blocks, - 'need_drop': need_drop, - 'drop_reason': drop_reason, - } - return return_dict diff --git a/magic_pdf/pre_proc/cut_image.py b/magic_pdf/pre_proc/cut_image.py deleted file mode 100644 index 901d372ec9ee6a5c6d9cdd0e25cf3b683f9179a9..0000000000000000000000000000000000000000 --- a/magic_pdf/pre_proc/cut_image.py +++ /dev/null @@ -1,32 +0,0 @@ -from loguru import logger - -from magic_pdf.config.ocr_content_type import ContentType -from magic_pdf.libs.commons import join_path -from magic_pdf.libs.pdf_image_tools import cut_image - - -def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter): - def return_path(type): - return join_path(pdf_bytes_md5, type) - - for span in spans: - span_type = span['type'] - if span_type == ContentType.Image: - if not check_img_bbox(span['bbox']) or not imageWriter: - continue - span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('images'), - imageWriter=imageWriter) - elif span_type == ContentType.Table: - if not check_img_bbox(span['bbox']) or not imageWriter: - continue - span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'), - imageWriter=imageWriter) - - return spans - - -def check_img_bbox(bbox) -> bool: - if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]): - logger.warning(f'image_bboxes: 错误的box, {bbox}') - return False - return True diff --git a/magic_pdf/pre_proc/ocr_detect_all_bboxes.py b/magic_pdf/pre_proc/ocr_detect_all_bboxes.py deleted file mode 100644 index b9fd5b029f1647e59e3c5f603b936201a726ddc8..0000000000000000000000000000000000000000 --- a/magic_pdf/pre_proc/ocr_detect_all_bboxes.py +++ /dev/null @@ -1,257 +0,0 @@ -from magic_pdf.config.ocr_content_type import BlockType -from magic_pdf.libs.boxbase import ( - calculate_iou, - calculate_overlap_area_in_bbox1_area_ratio, - calculate_vertical_projection_overlap_ratio, - get_minbox_if_overlap_by_ratio -) -from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_block - - -def add_bboxes(blocks, block_type, bboxes): - for block in blocks: - x0, y0, x1, y1 = block['bbox'] - if block_type in [ - BlockType.ImageBody, - BlockType.ImageCaption, - BlockType.ImageFootnote, - BlockType.TableBody, - BlockType.TableCaption, - BlockType.TableFootnote, - ]: - bboxes.append( - [ - x0, - y0, - x1, - y1, - None, - None, - None, - block_type, - None, - None, - None, - None, - block['score'], - block['group_id'], - ] - ) - else: - bboxes.append( - [ - x0, - y0, - x1, - y1, - None, - None, - None, - block_type, - None, - None, - None, - None, - block['score'], - ] - ) - - -def ocr_prepare_bboxes_for_layout_split_v2( - img_body_blocks, - img_caption_blocks, - img_footnote_blocks, - table_body_blocks, - table_caption_blocks, - table_footnote_blocks, - discarded_blocks, - text_blocks, - title_blocks, - interline_equation_blocks, - page_w, - page_h, -): - all_bboxes = [] - - add_bboxes(img_body_blocks, BlockType.ImageBody, all_bboxes) - add_bboxes(img_caption_blocks, BlockType.ImageCaption, all_bboxes) - add_bboxes(img_footnote_blocks, BlockType.ImageFootnote, all_bboxes) - add_bboxes(table_body_blocks, BlockType.TableBody, all_bboxes) - add_bboxes(table_caption_blocks, BlockType.TableCaption, all_bboxes) - add_bboxes(table_footnote_blocks, BlockType.TableFootnote, all_bboxes) - add_bboxes(text_blocks, BlockType.Text, all_bboxes) - 
add_bboxes(title_blocks, BlockType.Title, all_bboxes) - add_bboxes(interline_equation_blocks, BlockType.InterlineEquation, all_bboxes) - - """block嵌套问题解决""" - """文本框与标题框重叠,优先信任文本框""" - all_bboxes = fix_text_overlap_title_blocks(all_bboxes) - """任何框体与舍弃框重叠,优先信任舍弃框""" - all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks) - - # interline_equation 与title或text框冲突的情况,分两种情况处理 - """interline_equation框与文本类型框iou比较接近1的时候,信任行间公式框""" - all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes) - """interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框""" - # 通过后续大框套小框逻辑删除 - - """discarded_blocks""" - all_discarded_blocks = [] - add_bboxes(discarded_blocks, BlockType.Discarded, all_discarded_blocks) - - """footnote识别:宽度超过1/3页面宽度的,高度超过10的,处于页面下半30%区域的""" - footnote_blocks = [] - for discarded in discarded_blocks: - x0, y0, x1, y1 = discarded['bbox'] - if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h * 0.7): - footnote_blocks.append([x0, y0, x1, y1]) - - """移除在footnote下面的任何框""" - need_remove_blocks = find_blocks_under_footnote(all_bboxes, footnote_blocks) - if len(need_remove_blocks) > 0: - for block in need_remove_blocks: - all_bboxes.remove(block) - all_discarded_blocks.append(block) - - """经过以上处理后,还存在大框套小框的情况,则删除小框""" - all_bboxes = remove_overlaps_min_blocks(all_bboxes) - all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks) - """将剩余的bbox做分离处理,防止后面分layout时出错""" - # all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes) - all_bboxes.sort(key=lambda x: x[0]+x[1]) - return all_bboxes, all_discarded_blocks, footnote_blocks - - -def find_blocks_under_footnote(all_bboxes, footnote_blocks): - need_remove_blocks = [] - for block in all_bboxes: - block_x0, block_y0, block_x1, block_y1 = block[:4] - for footnote_bbox in footnote_blocks: - footnote_x0, footnote_y0, footnote_x1, footnote_y1 = footnote_bbox - # 如果footnote的纵向投影覆盖了block的纵向投影的80%且block的y0大于等于footnote的y1 - if ( - block_y0 >= footnote_y1 - and calculate_vertical_projection_overlap_ratio( - (block_x0, block_y0, block_x1, block_y1), footnote_bbox - ) - >= 0.8 - ): - if block not in need_remove_blocks: - need_remove_blocks.append(block) - break - return need_remove_blocks - - -def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes): - # 先提取所有text和interline block - text_blocks = [] - for block in all_bboxes: - if block[7] == BlockType.Text: - text_blocks.append(block) - interline_equation_blocks = [] - for block in all_bboxes: - if block[7] == BlockType.InterlineEquation: - interline_equation_blocks.append(block) - - need_remove = [] - - for interline_equation_block in interline_equation_blocks: - for text_block in text_blocks: - interline_equation_block_bbox = interline_equation_block[:4] - text_block_bbox = text_block[:4] - if calculate_iou(interline_equation_block_bbox, text_block_bbox) > 0.8: - if text_block not in need_remove: - need_remove.append(text_block) - - if len(need_remove) > 0: - for block in need_remove: - all_bboxes.remove(block) - - return all_bboxes - - -def fix_text_overlap_title_blocks(all_bboxes): - # 先提取所有text和title block - text_blocks = [] - for block in all_bboxes: - if block[7] == BlockType.Text: - text_blocks.append(block) - title_blocks = [] - for block in all_bboxes: - if block[7] == BlockType.Title: - title_blocks.append(block) - - need_remove = [] - - for text_block in text_blocks: - for title_block in title_blocks: - text_block_bbox = text_block[:4] - title_block_bbox = title_block[:4] - if 
calculate_iou(text_block_bbox, title_block_bbox) > 0.8: - if title_block not in need_remove: - need_remove.append(title_block) - - if len(need_remove) > 0: - for block in need_remove: - all_bboxes.remove(block) - - return all_bboxes - - -def remove_need_drop_blocks(all_bboxes, discarded_blocks): - need_remove = [] - for block in all_bboxes: - for discarded_block in discarded_blocks: - block_bbox = block[:4] - if ( - calculate_overlap_area_in_bbox1_area_ratio( - block_bbox, discarded_block['bbox'] - ) - > 0.6 - ): - if block not in need_remove: - need_remove.append(block) - break - - if len(need_remove) > 0: - for block in need_remove: - all_bboxes.remove(block) - return all_bboxes - - -def remove_overlaps_min_blocks(all_bboxes): - # 重叠block,小的不能直接删除,需要和大的那个合并成一个更大的。 - # 删除重叠blocks中较小的那些 - need_remove = [] - for block1 in all_bboxes: - for block2 in all_bboxes: - if block1 != block2: - block1_bbox = block1[:4] - block2_bbox = block2[:4] - overlap_box = get_minbox_if_overlap_by_ratio( - block1_bbox, block2_bbox, 0.8 - ) - if overlap_box is not None: - block_to_remove = next( - (block for block in all_bboxes if block[:4] == overlap_box), - None, - ) - if ( - block_to_remove is not None - and block_to_remove not in need_remove - ): - large_block = block1 if block1 != block_to_remove else block2 - x1, y1, x2, y2 = large_block[:4] - sx1, sy1, sx2, sy2 = block_to_remove[:4] - x1 = min(x1, sx1) - y1 = min(y1, sy1) - x2 = max(x2, sx2) - y2 = max(y2, sy2) - large_block[:4] = [x1, y1, x2, y2] - need_remove.append(block_to_remove) - - if len(need_remove) > 0: - for block in need_remove: - all_bboxes.remove(block) - - return all_bboxes diff --git a/magic_pdf/pre_proc/ocr_dict_merge.py b/magic_pdf/pre_proc/ocr_dict_merge.py deleted file mode 100644 index 38ca3652b2d34e09e74b5bc2f8acbaddc11d4917..0000000000000000000000000000000000000000 --- a/magic_pdf/pre_proc/ocr_dict_merge.py +++ /dev/null @@ -1,159 +0,0 @@ -from magic_pdf.config.ocr_content_type import BlockType, ContentType -from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, calculate_overlap_area_in_bbox1_area_ratio - - -# 将每一个line中的span从左到右排序 -def line_sort_spans_by_left_to_right(lines): - line_objects = [] - for line in lines: - # 按照x0坐标排序 - line.sort(key=lambda span: span['bbox'][0]) - line_bbox = [ - min(span['bbox'][0] for span in line), # x0 - min(span['bbox'][1] for span in line), # y0 - max(span['bbox'][2] for span in line), # x1 - max(span['bbox'][3] for span in line), # y1 - ] - line_objects.append({ - 'bbox': line_bbox, - 'spans': line, - }) - return line_objects - - -def merge_spans_to_line(spans, threshold=0.6): - if len(spans) == 0: - return [] - else: - # 按照y0坐标排序 - spans.sort(key=lambda span: span['bbox'][1]) - - lines = [] - current_line = [spans[0]] - for span in spans[1:]: - # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation" - # image和table类型,同上 - if span['type'] in [ - ContentType.InterlineEquation, ContentType.Image, - ContentType.Table - ] or any(s['type'] in [ - ContentType.InterlineEquation, ContentType.Image, - ContentType.Table - ] for s in current_line): - # 则开始新行 - lines.append(current_line) - current_line = [span] - continue - - # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行 - if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox'], threshold): - current_line.append(span) - else: - # 否则,开始新行 - lines.append(current_line) - current_line = [span] - - # 添加最后一行 - if current_line: - lines.append(current_line) - - return lines - - -def span_block_type_compatible(span_type, 
block_type): - if span_type in [ContentType.Text, ContentType.InlineEquation]: - return block_type in [ - BlockType.Text, - BlockType.Title, - BlockType.ImageCaption, - BlockType.ImageFootnote, - BlockType.TableCaption, - BlockType.TableFootnote, - BlockType.Discarded - ] - elif span_type == ContentType.InterlineEquation: - return block_type in [BlockType.InterlineEquation, BlockType.Text] - elif span_type == ContentType.Image: - return block_type in [BlockType.ImageBody] - elif span_type == ContentType.Table: - return block_type in [BlockType.TableBody] - else: - return False - - -def fill_spans_in_blocks(blocks, spans, radio): - """将allspans中的span按位置关系,放入blocks中.""" - block_with_spans = [] - for block in blocks: - block_type = block[7] - block_bbox = block[0:4] - block_dict = { - 'type': block_type, - 'bbox': block_bbox, - } - if block_type in [ - BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote, - BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote - ]: - block_dict['group_id'] = block[-1] - block_spans = [] - for span in spans: - span_bbox = span['bbox'] - if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > radio and span_block_type_compatible(span['type'], block_type): - block_spans.append(span) - - block_dict['spans'] = block_spans - block_with_spans.append(block_dict) - - # 从spans删除已经放入block_spans中的span - if len(block_spans) > 0: - for span in block_spans: - spans.remove(span) - - return block_with_spans, spans - - -def fix_block_spans_v2(block_with_spans): - fix_blocks = [] - for block in block_with_spans: - block_type = block['type'] - - if block_type in [BlockType.Text, BlockType.Title, - BlockType.ImageCaption, BlockType.ImageFootnote, - BlockType.TableCaption, BlockType.TableFootnote - ]: - block = fix_text_block(block) - elif block_type in [BlockType.InterlineEquation, BlockType.ImageBody, BlockType.TableBody]: - block = fix_interline_block(block) - else: - continue - fix_blocks.append(block) - return fix_blocks - - -def fix_discarded_block(discarded_block_with_spans): - fix_discarded_blocks = [] - for block in discarded_block_with_spans: - block = fix_text_block(block) - fix_discarded_blocks.append(block) - return fix_discarded_blocks - - -def fix_text_block(block): - # 文本block中的公式span都应该转换成行内type - for span in block['spans']: - if span['type'] == ContentType.InterlineEquation: - span['type'] = ContentType.InlineEquation - block_lines = merge_spans_to_line(block['spans']) - sort_block_lines = line_sort_spans_by_left_to_right(block_lines) - block['lines'] = sort_block_lines - del block['spans'] - return block - - -def fix_interline_block(block): - block_lines = merge_spans_to_line(block['spans']) - sort_block_lines = line_sort_spans_by_left_to_right(block_lines) - block['lines'] = sort_block_lines - del block['spans'] - return block diff --git a/magic_pdf/pre_proc/ocr_span_list_modify.py b/magic_pdf/pre_proc/ocr_span_list_modify.py deleted file mode 100644 index 3fa9d2dd556c07b7d8571141f7103b024513e78c..0000000000000000000000000000000000000000 --- a/magic_pdf/pre_proc/ocr_span_list_modify.py +++ /dev/null @@ -1,131 +0,0 @@ - -from magic_pdf.config.drop_tag import DropTag -from magic_pdf.config.ocr_content_type import BlockType -from magic_pdf.libs.boxbase import calculate_iou, get_minbox_if_overlap_by_ratio - - -def remove_overlaps_low_confidence_spans(spans): - dropped_spans = [] - # 删除重叠spans中置信度低的的那些 - for span1 in spans: - for span2 in spans: - if span1 != span2: - # span1 或 span2 任何一个都不应该在 dropped_spans 中 - if span1 
in dropped_spans or span2 in dropped_spans: - continue - else: - if calculate_iou(span1['bbox'], span2['bbox']) > 0.9: - if span1['score'] < span2['score']: - span_need_remove = span1 - else: - span_need_remove = span2 - if ( - span_need_remove is not None - and span_need_remove not in dropped_spans - ): - dropped_spans.append(span_need_remove) - - if len(dropped_spans) > 0: - for span_need_remove in dropped_spans: - spans.remove(span_need_remove) - span_need_remove['tag'] = DropTag.SPAN_OVERLAP - - return spans, dropped_spans - - -def check_chars_is_overlap_in_span(chars): - for i in range(len(chars)): - for j in range(i + 1, len(chars)): - if calculate_iou(chars[i]['bbox'], chars[j]['bbox']) > 0.35: - return True - return False - - -def remove_x_overlapping_chars(span, median_width): - """ - Remove characters from a span that overlap significantly on the x-axis. - - Args: - median_width: - span (dict): A span containing a list of chars, each with bbox coordinates - in the format [x0, y0, x1, y1] - - Returns: - dict: The span with overlapping characters removed - """ - if 'chars' not in span or len(span['chars']) < 2: - return span - - overlap_threshold = median_width * 0.3 - - i = 0 - while i < len(span['chars']) - 1: - char1 = span['chars'][i] - char2 = span['chars'][i + 1] - - # Calculate overlap width - x_left = max(char1['bbox'][0], char2['bbox'][0]) - x_right = min(char1['bbox'][2], char2['bbox'][2]) - - if x_right > x_left: # There is overlap - overlap_width = x_right - x_left - - if overlap_width > overlap_threshold: - if char1['c'] == char2['c'] or char1['c'] == ' ' or char2['c'] == ' ': - # Determine which character to remove - width1 = char1['bbox'][2] - char1['bbox'][0] - width2 = char2['bbox'][2] - char2['bbox'][0] - if width1 < width2: - # Remove the narrower character - span['chars'].pop(i) - else: - span['chars'].pop(i + 1) - else: - i += 1 - - # Don't increment i since we need to check the new pair - else: - i += 1 - else: - i += 1 - - return span - - -def remove_overlaps_min_spans(spans): - dropped_spans = [] - # 删除重叠spans中较小的那些 - for span1 in spans: - for span2 in spans: - if span1 != span2: - # span1 或 span2 任何一个都不应该在 dropped_spans 中 - if span1 in dropped_spans or span2 in dropped_spans: - continue - else: - overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65) - if overlap_box is not None: - span_need_remove = next((span for span in spans if span['bbox'] == overlap_box), None) - if span_need_remove is not None and span_need_remove not in dropped_spans: - dropped_spans.append(span_need_remove) - if len(dropped_spans) > 0: - for span_need_remove in dropped_spans: - spans.remove(span_need_remove) - span_need_remove['tag'] = DropTag.SPAN_OVERLAP - - return spans, dropped_spans - - -def get_qa_need_list_v2(blocks): - # 创建 images, tables, interline_equations, inline_equations 的副本 - images = [] - tables = [] - interline_equations = [] - - for block in blocks: - if block['type'] == BlockType.Image: - images.append(block) - elif block['type'] == BlockType.Table: - tables.append(block) - elif block['type'] == BlockType.InterlineEquation: - interline_equations.append(block) - return images, tables, interline_equations diff --git a/magic_pdf/pre_proc/remove_bbox_overlap.py b/magic_pdf/pre_proc/remove_bbox_overlap.py deleted file mode 100644 index 35f96a10ba30221d6fb218a5e99e17253b5e30b1..0000000000000000000000000000000000000000 --- a/magic_pdf/pre_proc/remove_bbox_overlap.py +++ /dev/null @@ -1,100 +0,0 @@ -from magic_pdf.config.drop_reason import 
DropReason -from magic_pdf.libs.boxbase import _is_in, _is_part_overlap - - -def _remove_overlap_between_bbox(bbox1, bbox2): - if _is_part_overlap(bbox1, bbox2): - ix0, iy0, ix1, iy1 = bbox1 - x0, y0, x1, y1 = bbox2 - - diff_x = min(x1, ix1) - max(x0, ix0) - diff_y = min(y1, iy1) - max(y0, iy0) - - if diff_y > diff_x: - if x1 >= ix1: - mid = (x0 + ix1) // 2 - ix1 = min(mid - 0.25, ix1) - x0 = max(mid + 0.25, x0) - else: - mid = (ix0 + x1) // 2 - ix0 = max(mid + 0.25, ix0) - x1 = min(mid - 0.25, x1) - else: - if y1 >= iy1: - mid = (y0 + iy1) // 2 - y0 = max(mid + 0.25, y0) - iy1 = min(iy1, mid - 0.25) - else: - mid = (iy0 + y1) // 2 - y1 = min(y1, mid - 0.25) - iy0 = max(mid + 0.25, iy0) - - if ix1 > ix0 and iy1 > iy0 and y1 > y0 and x1 > x0: - bbox1 = [ix0, iy0, ix1, iy1] - bbox2 = [x0, y0, x1, y1] - return bbox1, bbox2, None - else: - return bbox1, bbox2, DropReason.NEGATIVE_BBOX_AREA - else: - return bbox1, bbox2, None - - -def _remove_overlap_between_bboxes(arr): - drop_reasons = [] - N = len(arr) - keeps = [True] * N - res = [None] * N - for i in range(N): - for j in range(N): - if i == j: - continue - if _is_in(arr[i]['bbox'], arr[j]['bbox']): - keeps[i] = False - - for idx, v in enumerate(arr): - if not keeps[idx]: - continue - for i in range(N): - if res[i] is None: - continue - - bbox1, bbox2, drop_reason = _remove_overlap_between_bbox( - v['bbox'], res[i]['bbox'] - ) - if drop_reason is None: - v['bbox'] = bbox1 - res[i]['bbox'] = bbox2 - else: - if v['score'] > res[i]['score']: - keeps[i] = False - res[i] = None - else: - keeps[idx] = False - drop_reasons.append(drop_reason) - if keeps[idx]: - res[idx] = v - return res, drop_reasons - - -def remove_overlap_between_bbox_for_span(spans): - arr = [{'bbox': span['bbox'], 'score': span.get('score', 0.1)} for span in spans] - res, drop_reasons = _remove_overlap_between_bboxes(arr) - ret = [] - for i in range(len(res)): - if res[i] is None: - continue - spans[i]['bbox'] = res[i]['bbox'] - ret.append(spans[i]) - return ret, drop_reasons - - -def remove_overlap_between_bbox_for_block(all_bboxes): - arr = [{'bbox': bbox[:4], 'score': bbox[-1]} for bbox in all_bboxes] - res, drop_reasons = _remove_overlap_between_bboxes(arr) - ret = [] - for i in range(len(res)): - if res[i] is None: - continue - all_bboxes[i][:4] = res[i]['bbox'] - ret.append(all_bboxes[i]) - return ret, drop_reasons diff --git a/magic_pdf/resources/fasttext-langdetect/lid.176.ftz b/magic_pdf/resources/fasttext-langdetect/lid.176.ftz deleted file mode 100644 index 1fb85b357b22f67f019567f0e7003f4d49bda7a0..0000000000000000000000000000000000000000 Binary files a/magic_pdf/resources/fasttext-langdetect/lid.176.ftz and /dev/null differ diff --git a/magic_pdf/resources/model_config/model_configs.yaml b/magic_pdf/resources/model_config/model_configs.yaml deleted file mode 100644 index 0ee1aa0e8de8ef6baec4964b75d9f753b5c25da5..0000000000000000000000000000000000000000 --- a/magic_pdf/resources/model_config/model_configs.yaml +++ /dev/null @@ -1,8 +0,0 @@ -weights: - layoutlmv3: Layout/LayoutLMv3/model_final.pth - doclayout_yolo: Layout/YOLO/doclayout_yolo_docstructbench_imgsz1280_2501.pt - yolo_v8_mfd: MFD/YOLO/yolo_v8_ft.pt - unimernet_small: MFR/unimernet_hf_small_2503 - struct_eqtable: TabRec/StructEqTable - tablemaster: TabRec/TableMaster - rapid_table: TabRec/RapidTable \ No newline at end of file diff --git a/magic_pdf/resources/slanet_plus/slanet-plus.onnx b/magic_pdf/resources/slanet_plus/slanet-plus.onnx deleted file mode 100644 index 
d263823cdbc683f63ab7ec3d46eaa381a93b8079..0000000000000000000000000000000000000000 Binary files a/magic_pdf/resources/slanet_plus/slanet-plus.onnx and /dev/null differ diff --git a/magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt b/magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt deleted file mode 100644 index 8e7dbe703a4431b318d5b73724bd9693acdd2be2..0000000000000000000000000000000000000000 Binary files a/magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt and /dev/null differ diff --git a/magic_pdf/spark/__init__.py b/magic_pdf/spark/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/magic_pdf/spark/spark_api.py b/magic_pdf/spark/spark_api.py deleted file mode 100644 index 5d9faeb580840336978e8f4c3e683a7ec709dfa1..0000000000000000000000000000000000000000 --- a/magic_pdf/spark/spark_api.py +++ /dev/null @@ -1,49 +0,0 @@ -from loguru import logger - -from magic_pdf.config.drop_reason import DropReason - - -def get_data_source(jso: dict): - data_source = jso.get('data_source') - if data_source is None: - data_source = jso.get('file_source') - return data_source - - -def get_data_type(jso: dict): - data_type = jso.get('data_type') - if data_type is None: - data_type = jso.get('file_type') - return data_type - - -def get_bookid(jso: dict): - book_id = jso.get('bookid') - if book_id is None: - book_id = jso.get('original_file_id') - return book_id - - -def exception_handler(jso: dict, e): - logger.exception(e) - jso['_need_drop'] = True - jso['_drop_reason'] = DropReason.Exception - jso['_exception'] = f'ERROR: {e}' - return jso - - -def get_bookname(jso: dict): - data_source = get_data_source(jso) - file_id = jso.get('file_id') - book_name = f'{data_source}/{file_id}' - return book_name - - -def spark_json_extractor(jso: dict) -> dict: - - """从json中提取数据,返回一个dict.""" - - return { - '_pdf_type': jso['_pdf_type'], - 'model_list': jso['doc_layout_result'], - } diff --git a/magic_pdf/tools/__init__.py b/magic_pdf/tools/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/magic_pdf/tools/cli.py b/magic_pdf/tools/cli.py deleted file mode 100644 index d204b26a4cba7e5088bd7b37807df44ea478fbef..0000000000000000000000000000000000000000 --- a/magic_pdf/tools/cli.py +++ /dev/null @@ -1,161 +0,0 @@ -import os -import shutil -import tempfile -from pathlib import Path - -import click -import fitz -from loguru import logger - -import magic_pdf.model as model_config -from magic_pdf.data.batch_build_dataset import batch_build_dataset -from magic_pdf.data.data_reader_writer import FileBasedDataReader -from magic_pdf.data.dataset import Dataset -from magic_pdf.libs.version import __version__ -from magic_pdf.tools.common import batch_do_parse, do_parse, parse_pdf_methods -from magic_pdf.utils.office_to_pdf import convert_file_to_pdf - -pdf_suffixes = ['.pdf'] -ms_office_suffixes = ['.ppt', '.pptx', '.doc', '.docx'] -image_suffixes = ['.png', '.jpeg', '.jpg'] - - -@click.command() -@click.version_option(__version__, - '--version', - '-v', - help='display the version and exit') -@click.option( - '-p', - '--path', - 'path', - type=click.Path(exists=True), - required=True, - help='local filepath or directory. 
support PDF, PPT, PPTX, DOC, DOCX, PNG, JPG files', -) -@click.option( - '-o', - '--output-dir', - 'output_dir', - type=click.Path(), - required=True, - help='output local directory', -) -@click.option( - '-m', - '--method', - 'method', - type=parse_pdf_methods, - help="""the method for parsing pdf. -ocr: using ocr technique to extract information from pdf. -txt: suitable for the text-based pdf only and outperform ocr. -auto: automatically choose the best method for parsing pdf from ocr and txt. -without method specified, auto will be used by default.""", - default='auto', -) -@click.option( - '-l', - '--lang', - 'lang', - type=str, - help=""" - Input the languages in the pdf (if known) to improve OCR accuracy. Optional. - You should input "Abbreviation" with language form url: - https://paddlepaddle.github.io/PaddleOCR/latest/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations - """, - default=None, -) -@click.option( - '-d', - '--debug', - 'debug_able', - type=bool, - help='Enables detailed debugging information during the execution of the CLI commands.', - default=False, -) -@click.option( - '-s', - '--start', - 'start_page_id', - type=int, - help='The starting page for PDF parsing, beginning from 0.', - default=0, -) -@click.option( - '-e', - '--end', - 'end_page_id', - type=int, - help='The ending page for PDF parsing, beginning from 0.', - default=None, -) -def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id): - os.makedirs(output_dir, exist_ok=True) - temp_dir = tempfile.mkdtemp() - def read_fn(path: Path): - if path.suffix in ms_office_suffixes: - convert_file_to_pdf(str(path), temp_dir) - fn = os.path.join(temp_dir, f'{path.stem}.pdf') - elif path.suffix in image_suffixes: - with open(str(path), 'rb') as f: - bits = f.read() - pdf_bytes = fitz.open(stream=bits).convert_to_pdf() - fn = os.path.join(temp_dir, f'{path.stem}.pdf') - with open(fn, 'wb') as f: - f.write(pdf_bytes) - elif path.suffix in pdf_suffixes: - fn = str(path) - else: - raise Exception(f'Unknown file suffix: {path.suffix}') - - disk_rw = FileBasedDataReader(os.path.dirname(fn)) - return disk_rw.read(os.path.basename(fn)) - - def parse_doc(doc_path: Path, dataset: Dataset | None = None): - try: - file_name = str(Path(doc_path).stem) - if dataset is None: - pdf_data_or_dataset = read_fn(doc_path) - else: - pdf_data_or_dataset = dataset - do_parse( - output_dir, - file_name, - pdf_data_or_dataset, - [], - method, - debug_able, - start_page_id=start_page_id, - end_page_id=end_page_id, - lang=lang - ) - - except Exception as e: - logger.exception(e) - - if os.path.isdir(path): - doc_paths = [] - for doc_path in Path(path).glob('*'): - if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes: - if doc_path.suffix in ms_office_suffixes: - convert_file_to_pdf(str(doc_path), temp_dir) - doc_path = Path(os.path.join(temp_dir, f'{doc_path.stem}.pdf')) - elif doc_path.suffix in image_suffixes: - with open(str(doc_path), 'rb') as f: - bits = f.read() - pdf_bytes = fitz.open(stream=bits).convert_to_pdf() - fn = os.path.join(temp_dir, f'{doc_path.stem}.pdf') - with open(fn, 'wb') as f: - f.write(pdf_bytes) - doc_path = Path(fn) - doc_paths.append(doc_path) - datasets = batch_build_dataset(doc_paths, 4, lang) - batch_do_parse(output_dir, [str(doc_path.stem) for doc_path in doc_paths], datasets, method, debug_able, lang=lang) - else: - parse_doc(Path(path)) - - shutil.rmtree(temp_dir) - - -if __name__ == '__main__': - cli() diff --git a/magic_pdf/tools/cli_dev.py 
b/magic_pdf/tools/cli_dev.py deleted file mode 100644 index 6973d04a60ccc31cfbfbcb8562324c05aac20e84..0000000000000000000000000000000000000000 --- a/magic_pdf/tools/cli_dev.py +++ /dev/null @@ -1,149 +0,0 @@ -import json as json_parse -import os -from pathlib import Path - -import click - -import magic_pdf.model as model_config -from magic_pdf.data.data_reader_writer import FileBasedDataReader, S3DataReader -from magic_pdf.libs.config_reader import get_s3_config -from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path, - remove_non_official_s3_args) -from magic_pdf.libs.version import __version__ -from magic_pdf.tools.common import do_parse, parse_pdf_methods - - -def read_s3_path(s3path): - bucket, key = parse_s3path(s3path) - - s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket) - s3_rw = S3DataReader('', bucket, s3_ak, s3_sk, s3_endpoint, 'auto') - may_range_params = parse_s3_range_params(s3path) - if may_range_params is None or 2 != len(may_range_params): - byte_start, byte_end = 0, -1 - else: - byte_start, byte_end = int(may_range_params[0]), int( - may_range_params[1]) - return s3_rw.read_at( - remove_non_official_s3_args(s3path), - byte_start, - byte_end, - ) - - -@click.group() -@click.version_option(__version__, '--version', '-v', help='显示版本信息') -def cli(): - pass - - -@cli.command() -@click.option( - '-j', - '--jsonl', - 'jsonl', - type=str, - help='输入 jsonl 路径,本地或者 s3 上的文件', - required=True, -) -@click.option( - '-m', - '--method', - 'method', - type=parse_pdf_methods, - help='指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法', - default='auto', -) -@click.option( - '-o', - '--output-dir', - 'output_dir', - type=click.Path(), - required=True, - help='输出到本地目录', -) -def jsonl(jsonl, method, output_dir): - model_config.__use_inside_model__ = False - if jsonl.startswith('s3://'): - jso = json_parse.loads(read_s3_path(jsonl).decode('utf-8')) - else: - with open(jsonl) as f: - jso = json_parse.loads(f.readline()) - os.makedirs(output_dir, exist_ok=True) - s3_file_path = jso.get('file_location') - if s3_file_path is None: - s3_file_path = jso.get('path') - pdf_file_name = Path(s3_file_path).stem - pdf_data = read_s3_path(s3_file_path) - - print(pdf_file_name, jso, method) - do_parse( - output_dir, - pdf_file_name, - pdf_data, - jso['doc_layout_result'], - method, - False, - f_dump_content_list=True, - f_draw_model_bbox=True, - ) - - -@cli.command() -@click.option( - '-p', - '--pdf', - 'pdf', - type=click.Path(exists=True), - required=True, - help='本地 PDF 文件', -) -@click.option( - '-j', - '--json', - 'json_data', - type=click.Path(exists=True), - required=True, - help='本地模型推理出的 json 数据', -) -@click.option('-o', - '--output-dir', - 'output_dir', - type=click.Path(), - required=True, - help='本地输出目录') -@click.option( - '-m', - '--method', - 'method', - type=parse_pdf_methods, - help='指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法', - default='auto', -) -def pdf(pdf, json_data, output_dir, method): - model_config.__use_inside_model__ = False - full_pdf_path = os.path.realpath(pdf) - os.makedirs(output_dir, exist_ok=True) - - def read_fn(path): - disk_rw = FileBasedDataReader(os.path.dirname(path)) - return disk_rw.read(os.path.basename(path)) - - model_json_list = json_parse.loads(read_fn(json_data).decode('utf-8')) - - file_name = str(Path(full_pdf_path).stem) - pdf_data = read_fn(full_pdf_path) - do_parse( - output_dir, - file_name, - pdf_data, - model_json_list, - method, - False, - f_dump_content_list=True, - f_draw_model_bbox=True, - ) - - -if 
__name__ == '__main__': - cli() diff --git a/magic_pdf/tools/common.py b/magic_pdf/tools/common.py deleted file mode 100644 index 06c52a8aa425afd3fa754a4a4ae63a49187a0349..0000000000000000000000000000000000000000 --- a/magic_pdf/tools/common.py +++ /dev/null @@ -1,340 +0,0 @@ -import os - -import click -import fitz -from loguru import logger - -import magic_pdf.model as model_config -from magic_pdf.config.enums import SupportedPdfParseMethod -from magic_pdf.config.make_content_config import DropMode, MakeMode -from magic_pdf.data.data_reader_writer import FileBasedDataWriter -from magic_pdf.data.dataset import Dataset, PymuDocDataset -from magic_pdf.libs.draw_bbox import draw_char_bbox -from magic_pdf.model.doc_analyze_by_custom_model import (batch_doc_analyze, - doc_analyze) - -# from io import BytesIO -# from pypdf import PdfReader, PdfWriter - - -def prepare_env(output_dir, pdf_file_name, method): - local_parent_dir = os.path.join(output_dir, pdf_file_name, method) - - local_image_dir = os.path.join(str(local_parent_dir), 'images') - local_md_dir = local_parent_dir - os.makedirs(local_image_dir, exist_ok=True) - os.makedirs(local_md_dir, exist_ok=True) - return local_image_dir, local_md_dir - - -# def convert_pdf_bytes_to_bytes_by_pypdf(pdf_bytes, start_page_id=0, end_page_id=None): -# # 将字节数据包装在 BytesIO 对象中 -# pdf_file = BytesIO(pdf_bytes) -# # 读取 PDF 的字节数据 -# reader = PdfReader(pdf_file) -# # 创建一个新的 PDF 写入器 -# writer = PdfWriter() -# # 将所有页面添加到新的 PDF 写入器中 -# end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(reader.pages) - 1 -# if end_page_id > len(reader.pages) - 1: -# logger.warning("end_page_id is out of range, use pdf_docs length") -# end_page_id = len(reader.pages) - 1 -# for i, page in enumerate(reader.pages): -# if start_page_id <= i <= end_page_id: -# writer.add_page(page) -# # 创建一个字节缓冲区来存储输出的 PDF 数据 -# output_buffer = BytesIO() -# # 将 PDF 写入字节缓冲区 -# writer.write(output_buffer) -# # 获取字节缓冲区的内容 -# converted_pdf_bytes = output_buffer.getvalue() -# return converted_pdf_bytes - - -def convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_id=None): - document = fitz.open('pdf', pdf_bytes) - output_document = fitz.open() - end_page_id = ( - end_page_id - if end_page_id is not None and end_page_id >= 0 - else len(document) - 1 - ) - if end_page_id > len(document) - 1: - logger.warning('end_page_id is out of range, use pdf_docs length') - end_page_id = len(document) - 1 - output_document.insert_pdf(document, from_page=start_page_id, to_page=end_page_id) - output_bytes = output_document.tobytes() - return output_bytes - - -def _do_parse( - output_dir, - pdf_file_name, - pdf_bytes_or_dataset, - model_list, - parse_method, - debug_able=False, - f_draw_span_bbox=True, - f_draw_layout_bbox=True, - f_dump_md=True, - f_dump_middle_json=True, - f_dump_model_json=True, - f_dump_orig_pdf=True, - f_dump_content_list=True, - f_make_md_mode=MakeMode.MM_MD, - f_draw_model_bbox=False, - f_draw_line_sort_bbox=False, - f_draw_char_bbox=False, - start_page_id=0, - end_page_id=None, - lang=None, - layout_model=None, - formula_enable=None, - table_enable=None, -): - from magic_pdf.operators.models import InferenceResult - if debug_able: - logger.warning('debug mode is on') - f_draw_model_bbox = True - f_draw_line_sort_bbox = True - # f_draw_char_bbox = True - - if isinstance(pdf_bytes_or_dataset, bytes): - pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf( - pdf_bytes_or_dataset, start_page_id, end_page_id - ) - ds = PymuDocDataset(pdf_bytes, 
lang=lang) - else: - ds = pdf_bytes_or_dataset - pdf_bytes = ds._raw_data - local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method) - - image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir) - image_dir = str(os.path.basename(local_image_dir)) - - if len(model_list) == 0: - if model_config.__use_inside_model__: - if parse_method == 'auto': - if ds.classify() == SupportedPdfParseMethod.TXT: - infer_result = ds.apply( - doc_analyze, - ocr=False, - lang=ds._lang, - layout_model=layout_model, - formula_enable=formula_enable, - table_enable=table_enable, - ) - pipe_result = infer_result.pipe_txt_mode( - image_writer, debug_mode=True, lang=ds._lang - ) - else: - infer_result = ds.apply( - doc_analyze, - ocr=True, - lang=ds._lang, - layout_model=layout_model, - formula_enable=formula_enable, - table_enable=table_enable, - ) - pipe_result = infer_result.pipe_ocr_mode( - image_writer, debug_mode=True, lang=ds._lang - ) - - elif parse_method == 'txt': - infer_result = ds.apply( - doc_analyze, - ocr=False, - lang=ds._lang, - layout_model=layout_model, - formula_enable=formula_enable, - table_enable=table_enable, - ) - pipe_result = infer_result.pipe_txt_mode( - image_writer, debug_mode=True, lang=ds._lang - ) - elif parse_method == 'ocr': - infer_result = ds.apply( - doc_analyze, - ocr=True, - lang=ds._lang, - layout_model=layout_model, - formula_enable=formula_enable, - table_enable=table_enable, - ) - pipe_result = infer_result.pipe_ocr_mode( - image_writer, debug_mode=True, lang=ds._lang - ) - else: - logger.error('unknown parse method') - exit(1) - else: - logger.error('need model list input') - exit(2) - else: - - infer_result = InferenceResult(model_list, ds) - if parse_method == 'ocr': - pipe_result = infer_result.pipe_ocr_mode( - image_writer, debug_mode=True, lang=ds._lang - ) - elif parse_method == 'txt': - pipe_result = infer_result.pipe_txt_mode( - image_writer, debug_mode=True, lang=ds._lang - ) - else: - if ds.classify() == SupportedPdfParseMethod.TXT: - pipe_result = infer_result.pipe_txt_mode( - image_writer, debug_mode=True, lang=ds._lang - ) - else: - pipe_result = infer_result.pipe_ocr_mode( - image_writer, debug_mode=True, lang=ds._lang - ) - - - if f_draw_model_bbox: - infer_result.draw_model( - os.path.join(local_md_dir, f'{pdf_file_name}_model.pdf') - ) - - if f_draw_layout_bbox: - pipe_result.draw_layout( - os.path.join(local_md_dir, f'{pdf_file_name}_layout.pdf') - ) - if f_draw_span_bbox: - pipe_result.draw_span(os.path.join(local_md_dir, f'{pdf_file_name}_spans.pdf')) - - if f_draw_line_sort_bbox: - pipe_result.draw_line_sort( - os.path.join(local_md_dir, f'{pdf_file_name}_line_sort.pdf') - ) - - if f_draw_char_bbox: - draw_char_bbox(pdf_bytes, local_md_dir, f'{pdf_file_name}_char_bbox.pdf') - - if f_dump_md: - pipe_result.dump_md( - md_writer, - f'{pdf_file_name}.md', - image_dir, - drop_mode=DropMode.NONE, - md_make_mode=f_make_md_mode, - ) - - if f_dump_middle_json: - pipe_result.dump_middle_json(md_writer, f'{pdf_file_name}_middle.json') - - if f_dump_model_json: - infer_result.dump_model(md_writer, f'{pdf_file_name}_model.json') - - if f_dump_orig_pdf: - md_writer.write( - f'{pdf_file_name}_origin.pdf', - pdf_bytes, - ) - - if f_dump_content_list: - pipe_result.dump_content_list( - md_writer, - f'{pdf_file_name}_content_list.json', - image_dir - ) - - logger.info(f'local output dir is {local_md_dir}') - -def do_parse( - output_dir, - pdf_file_name, - pdf_bytes_or_dataset, - model_list, - 
parse_method, - debug_able=False, - f_draw_span_bbox=True, - f_draw_layout_bbox=True, - f_dump_md=True, - f_dump_middle_json=True, - f_dump_model_json=True, - f_dump_orig_pdf=True, - f_dump_content_list=True, - f_make_md_mode=MakeMode.MM_MD, - f_draw_model_bbox=False, - f_draw_line_sort_bbox=False, - f_draw_char_bbox=False, - start_page_id=0, - end_page_id=None, - lang=None, - layout_model=None, - formula_enable=None, - table_enable=None, -): - parallel_count = 1 - if os.environ.get('MINERU_PARALLEL_INFERENCE_COUNT'): - parallel_count = int(os.environ['MINERU_PARALLEL_INFERENCE_COUNT']) - - if parallel_count > 1: - if isinstance(pdf_bytes_or_dataset, bytes): - pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf( - pdf_bytes_or_dataset, start_page_id, end_page_id - ) - ds = PymuDocDataset(pdf_bytes, lang=lang) - else: - ds = pdf_bytes_or_dataset - batch_do_parse(output_dir, [pdf_file_name], [ds], parse_method, debug_able, f_draw_span_bbox=f_draw_span_bbox, f_draw_layout_bbox=f_draw_layout_bbox, f_dump_md=f_dump_md, f_dump_middle_json=f_dump_middle_json, f_dump_model_json=f_dump_model_json, f_dump_orig_pdf=f_dump_orig_pdf, f_dump_content_list=f_dump_content_list, f_make_md_mode=f_make_md_mode, f_draw_model_bbox=f_draw_model_bbox, f_draw_line_sort_bbox=f_draw_line_sort_bbox, f_draw_char_bbox=f_draw_char_bbox, lang=lang) - else: - _do_parse(output_dir, pdf_file_name, pdf_bytes_or_dataset, model_list, parse_method, debug_able, start_page_id=start_page_id, end_page_id=end_page_id, lang=lang, layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable, f_draw_span_bbox=f_draw_span_bbox, f_draw_layout_bbox=f_draw_layout_bbox, f_dump_md=f_dump_md, f_dump_middle_json=f_dump_middle_json, f_dump_model_json=f_dump_model_json, f_dump_orig_pdf=f_dump_orig_pdf, f_dump_content_list=f_dump_content_list, f_make_md_mode=f_make_md_mode, f_draw_model_bbox=f_draw_model_bbox, f_draw_line_sort_bbox=f_draw_line_sort_bbox, f_draw_char_bbox=f_draw_char_bbox) - - -def batch_do_parse( - output_dir, - pdf_file_names: list[str], - pdf_bytes_or_datasets: list[bytes | Dataset], - parse_method, - debug_able=False, - f_draw_span_bbox=True, - f_draw_layout_bbox=True, - f_dump_md=True, - f_dump_middle_json=True, - f_dump_model_json=True, - f_dump_orig_pdf=True, - f_dump_content_list=True, - f_make_md_mode=MakeMode.MM_MD, - f_draw_model_bbox=False, - f_draw_line_sort_bbox=False, - f_draw_char_bbox=False, - lang=None, - layout_model=None, - formula_enable=None, - table_enable=None, -): - dss = [] - for v in pdf_bytes_or_datasets: - if isinstance(v, bytes): - dss.append(PymuDocDataset(v, lang=lang)) - else: - dss.append(v) - - infer_results = batch_doc_analyze(dss, parse_method, lang=lang, layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable) - for idx, infer_result in enumerate(infer_results): - _do_parse( - output_dir = output_dir, - pdf_file_name = pdf_file_names[idx], - pdf_bytes_or_dataset = dss[idx], - model_list = infer_result.get_infer_res(), - parse_method = parse_method, - debug_able = debug_able, - f_draw_span_bbox = f_draw_span_bbox, - f_draw_layout_bbox = f_draw_layout_bbox, - f_dump_md=f_dump_md, - f_dump_middle_json=f_dump_middle_json, - f_dump_model_json=f_dump_model_json, - f_dump_orig_pdf=f_dump_orig_pdf, - f_dump_content_list=f_dump_content_list, - f_make_md_mode=MakeMode.MM_MD, - f_draw_model_bbox=f_draw_model_bbox, - f_draw_line_sort_bbox=f_draw_line_sort_bbox, - f_draw_char_bbox=f_draw_char_bbox, - lang=lang, - ) - - -parse_pdf_methods = 
click.Choice(['ocr', 'txt', 'auto']) diff --git a/magic_pdf/utils/__init__.py b/magic_pdf/utils/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/magic_pdf/utils/annotations.py b/magic_pdf/utils/annotations.py deleted file mode 100644 index 898d88033b724b0083a6a2124b8f805cabadd104..0000000000000000000000000000000000000000 --- a/magic_pdf/utils/annotations.py +++ /dev/null @@ -1,11 +0,0 @@ - -from loguru import logger - - -def ImportPIL(f): - try: - import PIL # noqa: F401 - except ImportError: - logger.error('Pillow not installed, please install by pip.') - exit(1) - return f diff --git a/magic_pdf/utils/office_to_pdf.py b/magic_pdf/utils/office_to_pdf.py deleted file mode 100644 index b6d03daec6bb900f470041c096cd8d943d27382c..0000000000000000000000000000000000000000 --- a/magic_pdf/utils/office_to_pdf.py +++ /dev/null @@ -1,115 +0,0 @@ -import os -import subprocess -import platform -from pathlib import Path -import shutil - -from loguru import logger - - -class ConvertToPdfError(Exception): - def __init__(self, msg): - self.msg = msg - super().__init__(self.msg) - - -def check_fonts_installed(): - """Check if required Chinese fonts are installed.""" - system_type = platform.system() - - if system_type in ['Windows', 'Darwin']: - pass - else: - # Linux: use fc-list - try: - output = subprocess.check_output(['fc-list', ':lang=zh'], encoding='utf-8') - if output.strip(): # 只要有任何输出(非空) - return True - else: - logger.warning( - f"No Chinese fonts were detected, the converted document may not display Chinese content properly." - ) - except Exception: - pass - - -def get_soffice_command(): - """Return the path to LibreOffice's soffice executable depending on the platform.""" - system_type = platform.system() - - # First check if soffice is in PATH - soffice_path = shutil.which('soffice') - if soffice_path: - return soffice_path - - if system_type == 'Windows': - # Check common installation paths - possible_paths = [ - Path(os.environ.get('PROGRAMFILES', 'C:/Program Files')) / 'LibreOffice/program/soffice.exe', - Path(os.environ.get('PROGRAMFILES(X86)', 'C:/Program Files (x86)')) / 'LibreOffice/program/soffice.exe', - Path('C:/Program Files/LibreOffice/program/soffice.exe'), - Path('C:/Program Files (x86)/LibreOffice/program/soffice.exe') - ] - - # Check other drives for windows - for drive in ['C:', 'D:', 'E:', 'F:', 'G:', 'H:']: - possible_paths.append(Path(f"{drive}/LibreOffice/program/soffice.exe")) - - for path in possible_paths: - if path.exists(): - return str(path) - - raise ConvertToPdfError( - "LibreOffice not found. Please install LibreOffice from https://www.libreoffice.org/ " - "or ensure soffice.exe is in your PATH environment variable." - ) - else: - # For Linux/macOS, provide installation instructions if not found - try: - # Try to find soffice in standard locations - possible_paths = [ - '/usr/bin/soffice', - '/usr/local/bin/soffice', - '/opt/libreoffice/program/soffice', - '/Applications/LibreOffice.app/Contents/MacOS/soffice' - ] - for path in possible_paths: - if os.path.exists(path): - return path - - raise ConvertToPdfError( - "LibreOffice not found. Please install it:\n" - " - Ubuntu/Debian: sudo apt-get install libreoffice\n" - " - CentOS/RHEL: sudo yum install libreoffice\n" - " - macOS: brew install libreoffice or download from https://www.libreoffice.org/\n" - " - Or ensure soffice is in your PATH environment variable." 
- ) - except Exception as e: - raise ConvertToPdfError(f"Error locating LibreOffice: {str(e)}") - - -def convert_file_to_pdf(input_path, output_dir): - """Convert a single document (ppt, doc, etc.) to PDF.""" - if not os.path.isfile(input_path): - raise FileNotFoundError(f"The input file {input_path} does not exist.") - - os.makedirs(output_dir, exist_ok=True) - - check_fonts_installed() - - soffice_cmd = get_soffice_command() - - cmd = [ - soffice_cmd, - '--headless', - '--norestore', - '--invisible', - '--convert-to', 'pdf', - '--outdir', str(output_dir), - str(input_path) - ] - - process = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - - if process.returncode != 0: - raise ConvertToPdfError(f"LibreOffice convert failed: {process.stderr.decode()}") diff --git a/magic_pdf/model/sub_modules/language_detection/__init__.py b/mineru/api/__init__.py similarity index 100% rename from magic_pdf/model/sub_modules/language_detection/__init__.py rename to mineru/api/__init__.py diff --git a/magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py b/mineru/backend/__init__.py similarity index 100% rename from magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py rename to mineru/backend/__init__.py diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/__init__.py b/mineru/backend/pipeline/__init__.py similarity index 100% rename from magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/__init__.py rename to mineru/backend/pipeline/__init__.py diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/__init__.py b/mineru/backend/vlm/__init__.py similarity index 100% rename from magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/__init__.py rename to mineru/backend/vlm/__init__.py diff --git a/magic_pdf/post_proc/__init__.py b/mineru/cli/__init__.py similarity index 100% rename from magic_pdf/post_proc/__init__.py rename to mineru/cli/__init__.py diff --git a/mineru/data/__init__.py b/mineru/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1e17167ceda21a510d4486bf1711c9a72bf414db --- /dev/null +++ b/mineru/data/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Opendatalab. All rights reserved. diff --git a/mineru/libs/__init__.py b/mineru/libs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1e17167ceda21a510d4486bf1711c9a72bf414db --- /dev/null +++ b/mineru/libs/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Opendatalab. All rights reserved. diff --git a/mineru/resources/__init__.py b/mineru/resources/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1e17167ceda21a510d4486bf1711c9a72bf414db --- /dev/null +++ b/mineru/resources/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Opendatalab. All rights reserved. diff --git a/mineru/utils/__init__.py b/mineru/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1e17167ceda21a510d4486bf1711c9a72bf414db --- /dev/null +++ b/mineru/utils/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Opendatalab. All rights reserved. diff --git a/projects/gradio_app/app.py b/projects/gradio_app/app.py index 180fd7e2101f4c8ce49f121a81e54e1349b7e3d0..c9d32c1d9039d33617e5db5b3150122c527e6ada 100644 --- a/projects/gradio_app/app.py +++ b/projects/gradio_app/app.py @@ -40,7 +40,7 @@ def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, layout_mode, formula_en pdf_data, [], parse_method, - False, + True, end_page_id=end_page_id, layout_model=layout_mode, formula_enable=formula_enable,