count_pdfs.py 1.67 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import os


def find_pdfs_and_save_to_txt(directory, output_file):
    """Collect every PDF under *directory* and list the paths in *output_file*.

    The tree rooted at ``directory`` is walked recursively with ``os.walk``.
    A file counts as a PDF when its name ends with ``.pdf``, compared
    case-insensitively. Each match is recorded as ``os.path.join(root, name)``
    and written to ``output_file`` (UTF-8, overwritten), one path per line.

    Args:
        directory: Root directory to search.
        output_file: Path of the text file that receives the PDF paths.

    Returns:
        The number of PDF paths written.
    """
    matches = []

    # Finish the whole walk before opening the output file, so the listing is
    # complete in memory before anything is written to disk.
    for folder, _subdirs, names in os.walk(directory):
        matches.extend(
            os.path.join(folder, name)
            for name in names
            if name.lower().endswith('.pdf')
        )

    with open(output_file, 'w', encoding='utf-8') as sink:
        sink.writelines(entry + '\n' for entry in matches)

    return len(matches)

# Script step 1 (runs at module level): scan a document directory for PDFs and
# record their paths. The directory below is a hard-coded absolute path; replace
# it (e.g. with '.' for the current directory) to scan a different location.
directory = '/home/practice/rag/DCU知识库文档集合'
output_file = 'pdf_paths.txt'
pdf_count = find_pdfs_and_save_to_txt(directory, output_file)
# Report how many PDF paths were found and which file they were written to.
print(f"Found {pdf_count} PDF files. Paths are saved in '{output_file}'.")



def remove_duplicate_lines(file_path):
    """Write the unique lines of *file_path* to a sibling ``*_uq.txt`` file.

    Each line is stripped of leading and trailing whitespace before
    comparison, so lines that differ only in surrounding whitespace count as
    duplicates. The first occurrence of each line is kept and the original
    order is preserved. The input file itself is not modified.

    The output path is derived from ``file_path``: a trailing ``.txt`` (if
    present) is removed and ``_uq.txt`` is appended, e.g. ``paths.txt`` ->
    ``paths_uq.txt`` and ``paths.log`` -> ``paths.log_uq.txt``.

    Args:
        file_path: Path of the UTF-8 text file to de-duplicate.

    Returns:
        The number of unique lines written to the output file.
    """
    lines_seen = set()  # stripped lines encountered so far (fast membership test)
    unique_lines = []  # first occurrences, in original order

    # Read the file and keep only the first occurrence of each stripped line.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()  # drop the newline and surrounding whitespace
            if line not in lines_seen:
                unique_lines.append(line)
                lines_seen.add(line)

    # Strip only a *trailing* '.txt'. Splitting on the first '.txt' anywhere in
    # the path would truncate paths such as 'data.txt_dir/list.txt' and put the
    # output in the wrong place.
    suffix = '.txt'
    if file_path.endswith(suffix):
        stem = file_path[:-len(suffix)]
    else:
        stem = file_path
    out_path = stem + '_uq.txt'

    # Write the unique lines to the new file (the input is left untouched).
    with open(out_path, 'w', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in unique_lines)

    return len(unique_lines)


# Script step 2 (runs at module level): de-duplicate the lines of the path list
# file. remove_duplicate_lines writes its result to a separate '_uq.txt' file
# and returns the number of unique lines.
file_path = 'pdf_paths.txt'
unique_line_count = remove_duplicate_lines(file_path)

print(f"Number of unique lines: {unique_line_count}")