import os


def find_pdfs_and_save_to_txt(directory, output_file):
    """Recursively find all PDF files under *directory* and write their paths
    to *output_file*, one per line.

    Args:
        directory: Root directory to search (walked recursively).
        output_file: Path of the text file to write the PDF paths to.

    Returns:
        The number of PDF files found.
    """
    pdf_paths = []
    # Walk the tree; match case-insensitively so '.PDF' etc. are included.
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.lower().endswith('.pdf'):
                pdf_paths.append(os.path.join(root, file))

    # Write all collected paths, one per line.
    with open(output_file, 'w', encoding='utf-8') as f:
        for path in pdf_paths:
            f.write(path + '\n')

    return len(pdf_paths)


def remove_duplicate_lines(file_path):
    """Read *file_path*, drop duplicate lines (keeping first occurrence,
    comparing stripped lines), and write the result to '<stem>_uq.txt'.

    Args:
        file_path: Path of the text file to deduplicate.

    Returns:
        The number of unique lines written.
    """
    lines_seen = set()    # lines already encountered
    unique_lines = []     # first occurrences, in original order

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()  # normalize by trimming surrounding whitespace
            if line not in lines_seen:
                unique_lines.append(line)
                lines_seen.add(line)

    # BUGFIX: was file_path.split('.txt')[0], which truncates at the FIRST
    # '.txt' anywhere in the path (e.g. a parent dir named 'x.txt').
    # splitext removes only the final extension.
    out_path = os.path.splitext(file_path)[0] + '_uq.txt'

    with open(out_path, 'w', encoding='utf-8') as f:
        for line in unique_lines:
            f.write(line + '\n')

    return len(unique_lines)


if __name__ == "__main__":
    # Guarded so importing this module does not trigger filesystem I/O;
    # running it as a script behaves exactly as before.
    directory = '/home/practice/rag/DCU知识库文档集合'
    output_file = 'pdf_paths.txt'
    pdf_count = find_pdfs_and_save_to_txt(directory, output_file)
    # print(f"Found {pdf_count} PDF files. Paths are saved in '{output_file}'.")

    file_path = 'pdf_paths.txt'
    unique_line_count = remove_duplicate_lines(file_path)
    print(f"Number of unique lines: {unique_line_count}")