import os
import re
import tarfile
import lzma
import json

from tqdm import tqdm

# Paths (adjust to your actual layout)
SUBSETS_DIR = "/models/datasets/openwebtext/subsets"              # directory holding the tar archives
OUTPUT_JSONL = "/models/datasets/openwebtext/openwebtext.jsonl"   # output jsonl file


def process_tar_subsets():
    # Collect all tar subsets (subset00 through subset20)
    tar_files = [
        f for f in os.listdir(SUBSETS_DIR)
        if f.startswith("urlsf_subset") and f.endswith(".tar")
    ]
    tar_files.sort()  # sort by subset index

    with open(OUTPUT_JSONL, "w", encoding="utf-8") as out_f:
        for tar_name in tqdm(tar_files, desc="Processing tar archives"):
            tar_path = os.path.join(SUBSETS_DIR, tar_name)
            # Open the tar archive
            with tarfile.open(tar_path, "r") as tar:
                # Iterate over every member inside the archive
                for xz_info in tar.getmembers():
                    if not xz_info.name.endswith(".xz"):
                        continue  # only process xz-compressed members
                    # Read the xz member without extracting it to disk
                    with tar.extractfile(xz_info) as xz_f:
                        # Decompress the xz stream (it wraps a plain txt file)
                        with lzma.open(xz_f, "rt", encoding="utf-8") as txt_f:
                            # Read the text and normalize whitespace (same logic as the original script)
                            text = txt_f.read()
                            text = re.sub("\n\n\n+", "\n\n", text).strip()  # collapse extra blank lines
                            if text:  # skip empty documents
                                # Write one {"text": "..."} object per line
                                json.dump({"text": text}, out_f, ensure_ascii=False)
                                out_f.write("\n")


if __name__ == "__main__":
    process_tar_subsets()
    print(f"Generated {OUTPUT_JSONL}")