Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Ecological Empowerment
megatron-lm_openwebtext
Commits
f59c7272
Commit
f59c7272
authored
Oct 30, 2025
by
yangzhong
Browse files
原始tar包转换为openwebtext.jsonl的脚本
parent
d444a97a
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
40 additions
and
0 deletions
+40
-0
convert_openwebtext_jsonl.py
convert_openwebtext_jsonl.py
+40
-0
No files found.
convert_openwebtext_jsonl.py
0 → 100644
View file @
f59c7272
import
os
import
re
import
tarfile
import
lzma
import
json
from
tqdm
import
tqdm
# Path configuration (adjust these to match your environment).

# Directory that holds the tar archives.
SUBSETS_DIR = "/models/datasets/openwebtext/subsets"

# JSONL file that the script writes.
OUTPUT_JSONL = "/models/datasets/openwebtext/openwebtext.jsonl"
def process_tar_subsets():
    """Convert the tar subsets in ``SUBSETS_DIR`` into one JSONL file.

    Every file in ``SUBSETS_DIR`` whose name starts with ``urlsf_subset`` and
    ends with ``.tar`` is processed in sorted filename order. Each ``.xz``
    member of an archive is decompressed as UTF-8 text; runs of three or more
    newlines are collapsed to exactly two and surrounding whitespace is
    stripped. Every non-empty document is written to ``OUTPUT_JSONL`` as one
    ``{"text": ...}`` JSON object per line.

    ``OUTPUT_JSONL`` is opened in ``"w"`` mode, so any existing content is
    overwritten.
    """
    # Compiled once instead of on every document: three or more newlines.
    extra_blank_lines = re.compile(r"\n{3,}")

    # Lexicographic sort of the archive names found in the subsets directory.
    tar_files = sorted(
        name
        for name in os.listdir(SUBSETS_DIR)
        if name.startswith("urlsf_subset") and name.endswith(".tar")
    )

    with open(OUTPUT_JSONL, "w", encoding="utf-8") as out_f:
        for tar_name in tqdm(tar_files, desc="处理tar包"):
            tar_path = os.path.join(SUBSETS_DIR, tar_name)

            with tarfile.open(tar_path, "r") as tar:
                # Iterating the TarFile yields members as they are read,
                # without scanning the whole archive up front.
                for member in tar:
                    # Only xz-compressed members are of interest.
                    if not member.name.endswith(".xz"):
                        continue

                    # extractfile() returns None for members that are neither
                    # regular files nor links (e.g. a directory entry), which
                    # cannot be used as a context manager.
                    xz_f = tar.extractfile(member)
                    if xz_f is None:
                        continue

                    # Decompress the xz payload and read it as UTF-8 text.
                    with xz_f, lzma.open(xz_f, "rt", encoding="utf-8") as txt_f:
                        text = txt_f.read()

                    # Collapse superfluous blank lines and trim the edges.
                    text = extra_blank_lines.sub("\n\n", text).strip()

                    # Skip documents that are empty after cleaning.
                    if not text:
                        continue

                    # One {"text": "..."} object per output line.
                    out_f.write(json.dumps({"text": text}, ensure_ascii=False))
                    out_f.write("\n")
if __name__ == "__main__":
    # Run the conversion, then report the path of the generated file.
    process_tar_subsets()
    done_message = f"已生成{OUTPUT_JSONL}"
    print(done_message)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment