Commit f59c7272 authored by yangzhong
Browse files

原始tar包转换为openwebtext.jsonl的脚本

parent d444a97a
import os
import re
import tarfile
import lzma
import json
from tqdm import tqdm
# Path configuration (adjust these to match your own layout).
# Directory that holds the tar archives.
SUBSETS_DIR = "/models/datasets/openwebtext/subsets"
# JSONL file produced by the conversion.
OUTPUT_JSONL = "/models/datasets/openwebtext/openwebtext.jsonl"
def process_tar_subsets(subsets_dir=None, output_jsonl=None):
    """Convert the tar subsets into a single JSONL file.

    Every ``urlsf_subset*.tar`` archive found in the subsets directory is
    opened in sorted-name order. Each member whose name ends in ``.xz`` is
    decompressed and decoded as UTF-8; runs of three or more newlines are
    collapsed to a single blank line and surrounding whitespace is stripped.
    Each non-empty document is written as one ``{"text": ...}`` JSON object
    per line.

    Args:
        subsets_dir: Directory containing the tar archives. Defaults to the
            module-level ``SUBSETS_DIR``.
        output_jsonl: Path of the JSONL file to create; an existing file is
            overwritten. Defaults to the module-level ``OUTPUT_JSONL``.
    """
    # Resolve defaults at call time so the module constants stay the single
    # source of truth for callers that pass nothing.
    if subsets_dir is None:
        subsets_dir = SUBSETS_DIR
    if output_jsonl is None:
        output_jsonl = OUTPUT_JSONL

    # Compiled once instead of once per document.
    collapse_blank_lines = re.compile("\n\n\n+").sub

    # Collect the tar shards and process them in name order.
    tar_files = sorted(
        name
        for name in os.listdir(subsets_dir)
        if name.startswith("urlsf_subset") and name.endswith(".tar")
    )

    with open(output_jsonl, "w", encoding="utf-8") as out_f:
        for tar_name in tqdm(tar_files, desc="处理tar包"):
            tar_path = os.path.join(subsets_dir, tar_name)
            with tarfile.open(tar_path, "r") as tar:
                # Iterating the TarFile streams members one at a time rather
                # than building the full member list up front.
                for xz_info in tar:
                    if not xz_info.name.endswith(".xz"):
                        continue  # only xz-compressed members are documents

                    xz_f = tar.extractfile(xz_info)
                    if xz_f is None:
                        # extractfile() yields None for members that are not
                        # regular files or links (e.g. a directory entry).
                        continue

                    # Each xz member wraps a plain-text document.
                    with xz_f, lzma.open(xz_f, "rt", encoding="utf-8") as txt_f:
                        text = txt_f.read()

                    text = collapse_blank_lines("\n\n", text).strip()
                    if not text:
                        continue  # skip empty documents

                    # One JSON object per line: {"text": "..."}
                    json.dump({"text": text}, out_f, ensure_ascii=False)
                    out_f.write("\n")
if __name__ == "__main__":
    # Script entry point: run the conversion, then report the output path.
    # The message is only reached if the conversion returns without raising.
    process_tar_subsets()
    print(f"已生成 {OUTPUT_JSONL}")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment