原始tar包转换为openwebtext.jsonl的脚本

f59c7272 · yangzhong · d444a97a · f59c7272
Commit f59c7272 authored Oct 30, 2025 by yangzhong
Hide whitespace changes
Inline Side-by-side

Showing with 40 additions and 0 deletions

convert_openwebtext_jsonl.py convert_openwebtext_jsonl.py +40 -0

No files found.
--- a/convert_openwebtext_jsonl.py
+++ b/convert_openwebtext_jsonl.py
+import os
+import re
+import tarfile
+import lzma
+import json
+from tqdm import tqdm
+# Path configuration (adjust these to match your own environment)
+SUBSETS_DIR = "/models/datasets/openwebtext/subsets"  # directory that holds the tar archives
+OUTPUT_JSONL = "/models/datasets/openwebtext/openwebtext.jsonl"  # JSONL file that gets written
+def process_tar_subsets():
+    # 获取所有tar分包（subset00到subset20）
+    tar_files = [f for f in os.listdir(SUBSETS_DIR) if f.startswith("urlsf_subset") and f.endswith(".tar")]
+    tar_files.sort()  # 按序号排序
+    with open(OUTPUT_JSONL, "w", encoding="utf-8") as out_f:
+        for tar_name in tqdm(tar_files, desc="处理tar包"):
+            tar_path = os.path.join(SUBSETS_DIR, tar_name)
+            # 打开tar包
+            with tarfile.open(tar_path, "r") as tar:
+                # 遍历tar包内的所有xz文件
+                for xz_info in tar.getmembers():
+                    if not xz_info.name.endswith(".xz"):
+                        continue  # 只处理xz压缩文件
+                    # 读取xz文件内容
+                    with tar.extractfile(xz_info) as xz_f:
+                        # 解压xz文件（内部是txt文件）
+                        with lzma.open(xz_f, "rt", encoding="utf-8") as txt_f:
+                            # 读取文本并清理格式（同原脚本逻辑）
+                            text = txt_f.read()
+                            text = re.sub("\n\n\n+", "\n\n", text).strip()  # 合并多余空行
+                            if text:  # 跳过空文本
+                                # 写入jsonl（每行一个{"text": "..."}）
+                                json.dump({"text": text}, out_f, ensure_ascii=False)
+                                out_f.write("\n")
+if __name__ == "__main__":  # run the conversion only when executed as a script
+    process_tar_subsets()
+    print(f"已生成 {OUTPUT_JSONL}")  # report the output path once the conversion has finished