Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wxj
NeMo
Commits
541376a8
"src/nni_manager/vscode:/vscode.git/clone" did not exist on "ccc84eb02c8bf0763b1bfd07192f8e6dfb621dcf"
Commit
541376a8
authored
Jan 08, 2025
by
wxj
Browse files
添加dolly-15k数据集处理脚本
parent
0651a856
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
145 additions
and
2 deletions
+145
-2
K100AI_finetune.sh
K100AI_finetune.sh
+2
-2
NeMo-2.0.0.rc0.beta/scripts/dataset_processing/nlp/dolly_dataprep/dolly_dataspilt.py
.../dataset_processing/nlp/dolly_dataprep/dolly_dataspilt.py
+64
-0
NeMo-2.0.0.rc0.beta/scripts/dataset_processing/nlp/dolly_dataprep/preprocess.py
...ripts/dataset_processing/nlp/dolly_dataprep/preprocess.py
+79
-0
No files found.
K100AI_finetune.sh
View file @
541376a8
...
...
@@ -35,11 +35,11 @@ VALID_NAMES="[databricks-dolly-15k]"
# CONCAT_SAMPLING_PROBS="[0.3,0.7]" # "[1]" # 只有一个数据集设置为1
CONCAT_SAMPLING_PROBS
=
"[1]"
# 可能需要导入环境变量
# 可能需要导入
的
环境变量
export
LD_PRELOAD
=
/usr/local/lib/python3.10/site-packages/transformer_engine.libs/libgalaxyhip-8e217ef3.so.5.2.24472.1059-0a6afed7
# 运行训练脚本
torchrun
--nproc_per_node
8
\
/workspace/nemo_main
/NeMo-2.0.0.rc0.beta/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py
\
.
/NeMo-2.0.0.rc0.beta/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py
\
trainer.precision
=
bf16
\
trainer.devices
=
8
\
trainer.num_nodes
=
1
\
...
...
NeMo-2.0.0.rc0.beta/scripts/dataset_processing/nlp/dolly_dataprep/dolly_dataspilt.py
0 → 100644
View file @
541376a8
import
json
import
random
from
argparse
import
ArgumentParser
from
pathlib
import
Path
def _write_lines(path, lines):
    """Write each entry of *lines* to *path* as UTF-8, one entry per line."""
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in lines)


def main(path_to_data, *, train_proportion=0.80, validation_proportion=0.15, seed=None):
    """Shuffle the preprocessed Dolly JSONL file and split it into three files.

    Reads ``databricks-dolly-15k-output.jsonl`` from the directory
    ``path_to_data`` and writes ``training.jsonl``, ``validation.jsonl`` and
    ``test.jsonl`` into that same directory. Blank input lines are dropped so
    that no empty record ends up in any of the output files.

    Args:
        path_to_data: Directory that contains the input file and receives the
            three output files.
        train_proportion: Fraction of the records written to ``training.jsonl``.
        validation_proportion: Fraction of the records written to
            ``validation.jsonl``. Whatever is left after the training and
            validation slices goes to ``test.jsonl``.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` state is used, exactly as before.

    Raises:
        ValueError: If a proportion is negative or the two proportions sum to
            more than 1.
    """
    # A tiny tolerance keeps float sums such as 0.7 + 0.3 from being rejected.
    if (
        train_proportion < 0
        or validation_proportion < 0
        or train_proportion + validation_proportion > 1 + 1e-9
    ):
        raise ValueError(
            "train_proportion and validation_proportion must be non-negative "
            "and sum to at most 1."
        )

    root = Path(path_to_data)
    input_file = root / "databricks-dolly-15k-output.jsonl"

    # Read the JSONL file, ignoring blank lines (e.g. a trailing empty line).
    with open(input_file, "r", encoding="utf-8") as f:
        lines = [stripped for stripped in (raw.strip() for raw in f) if stripped]

    # Shuffle so that every split is a random sample of the dataset.
    if seed is None:
        random.shuffle(lines)
    else:
        random.Random(seed).shuffle(lines)

    # Calculate split indices; int() truncates, so the remainder lands in test.
    total_lines = len(lines)
    train_end = int(total_lines * train_proportion)
    val_end = train_end + int(total_lines * validation_proportion)

    _write_lines(root / "training.jsonl", lines[:train_end])
    _write_lines(root / "validation.jsonl", lines[train_end:val_end])
    _write_lines(root / "test.jsonl", lines[val_end:])
def get_args():
    """Parse the command-line arguments of the dataset split script.

    Returns:
        The parsed ``argparse.Namespace``. Its only attribute, ``input``, is
        the required ``--input`` value, which ``main`` uses as the directory
        holding ``databricks-dolly-15k-output.jsonl``.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--input",
        type=str,
        required=True,
        # The value is used as a directory, not as a file, so say so.
        help=(
            "Path to the directory containing databricks-dolly-15k-output.jsonl; "
            "training.jsonl, validation.jsonl and test.jsonl are written there."
        ),
    )
    return parser.parse_args()
if __name__ == "__main__":
    # Script entry point: hand the --input value straight to main().
    main(get_args().input)
\ No newline at end of file
NeMo-2.0.0.rc0.beta/scripts/dataset_processing/nlp/dolly_dataprep/preprocess.py
0 → 100644
View file @
541376a8
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Dolly data preprocessing.
Example usage:
python preprocess.py --input=<path/to/data/file>
"""
import
json
from
argparse
import
ArgumentParser
import
numpy
as
np
def to_jsonl(path_to_data):
    """Convert a raw Dolly JSONL file into input/output/category JSONL records.

    Every non-blank line of ``path_to_data`` is parsed as a JSON object with
    the keys ``instruction``, ``context``, ``response`` and ``category``. One
    record ``{"input": ..., "output": ..., "category": ...}`` is written per
    line to ``<path_to_data without its extension>-output.jsonl``.

    When the context is non-empty, the stripped context and the stripped
    instruction are joined by a blank line in a random order (chosen with
    ``np.random``). Otherwise the instruction is used unchanged as the input.

    Args:
        path_to_data: Path to the raw JSONL dataset file.

    Raises:
        ValueError: If a record has a non-empty context but an empty
            instruction.
    """
    # Local import so this block does not depend on the file's import header.
    import os

    print("Preprocessing data to jsonl format...")
    # splitext drops only the final extension, so dots elsewhere in the path
    # (e.g. "./data/dolly.jsonl") no longer truncate the output path.
    output_path = f"{os.path.splitext(path_to_data)[0]}-output.jsonl"
    with open(path_to_data, "r", encoding="utf-8") as f, open(output_path, "w", encoding="utf-8") as g:
        for line_number, raw in enumerate(f, start=1):
            if not raw.strip():
                # Tolerate blank lines instead of failing in json.loads.
                continue
            record = json.loads(raw)
            context = record["context"].strip()
            if context:
                instruction = record["instruction"].strip()
                if not instruction:
                    # Explicit raise: an assert would vanish under `python -O`.
                    raise ValueError(
                        f"Empty instruction on line {line_number} of {path_to_data}."
                    )
                # Randomize context and instruction order.
                if np.random.randint(0, 2) == 0:
                    prompt = f"{context}\n\n{instruction}"
                else:
                    prompt = f"{instruction}\n\n{context}"
            else:
                prompt = record["instruction"]
            g.write(
                json.dumps(
                    {
                        "input": prompt,
                        "output": record["response"],
                        "category": record["category"],
                    }
                )
                + "\n"
            )
    print(f"Data was successfully preprocessed and saved by {output_path} .")
def get_args():
    """Parse the command line of the preprocessing script.

    Returns:
        The parsed ``argparse.Namespace``; its ``input`` attribute holds the
        required ``--input`` string.
    """
    cli = ArgumentParser()
    cli.add_argument("--input", type=str, required=True, help="Path to jsonl dataset you want to prepare.")
    return cli.parse_args()
def main():
    """Preprocess the file named by the ``--input`` command-line option."""
    to_jsonl(get_args().input)
# Run the preprocessing only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment