Commit 541376a8 authored by wxj's avatar wxj
Browse files

添加dolly-15k数据集处理脚本

parent 0651a856
......@@ -35,11 +35,11 @@ VALID_NAMES="[databricks-dolly-15k]"
# CONCAT_SAMPLING_PROBS="[0.3,0.7]" # "[1]" # 只有一个数据集设置为1
CONCAT_SAMPLING_PROBS="[1]"
# 可能需要导入环境变量
# 可能需要导入环境变量
export LD_PRELOAD=/usr/local/lib/python3.10/site-packages/transformer_engine.libs/libgalaxyhip-8e217ef3.so.5.2.24472.1059-0a6afed7
# 运行训练脚本
torchrun --nproc_per_node 8 \
/workspace/nemo_main/NeMo-2.0.0.rc0.beta/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
./NeMo-2.0.0.rc0.beta/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
trainer.precision=bf16 \
trainer.devices=8 \
trainer.num_nodes=1 \
......
import json
import random
from argparse import ArgumentParser
from pathlib import Path
def main(path_to_data, train_proportion=0.80, validation_proportion=0.15):
    """Split the preprocessed dolly jsonl file into train/validation/test sets.

    Reads ``databricks-dolly-15k-output.jsonl`` from *path_to_data*, shuffles
    its lines, and writes ``training.jsonl``, ``validation.jsonl`` and
    ``test.jsonl`` into the same directory.  The test split receives whatever
    lines remain after the train and validation splits are taken, so no
    record is ever dropped.

    Args:
        path_to_data: Directory containing the preprocessed jsonl file.
        train_proportion: Fraction of lines used for training (default 0.80).
        validation_proportion: Fraction of lines used for validation
            (default 0.15; the remainder — 0.05 by default — becomes test).
    """
    root = Path(path_to_data)
    input_file = root / "databricks-dolly-15k-output.jsonl"
    training_output_file = root / "training.jsonl"
    validation_output_file = root / "validation.jsonl"
    test_output_file = root / "test.jsonl"
    # Read the JSONL file and shuffle the JSON objects.
    # NOTE(review): shuffle is unseeded, so splits differ between runs.
    with open(input_file, "r") as f:
        lines = f.readlines()
    random.shuffle(lines)
    # Calculate split indices.
    total_lines = len(lines)
    train_index = int(total_lines * train_proportion)
    val_index = int(total_lines * validation_proportion)
    # Distribute JSON objects into training, validation and test sets;
    # the test split takes the remainder.
    train_data = lines[:train_index]
    validation_data = lines[train_index:train_index + val_index]
    test_data = lines[train_index + val_index:]

    def _write_split(output_file, data):
        # Normalize each record to exactly one trailing newline.
        with open(output_file, "w") as f:
            for line in data:
                f.write(line.strip() + "\n")

    # One shared helper keeps the three writes consistent (the original
    # duplicated this loop three times, with a copy-pasted comment that
    # mislabeled the test file as a training file).
    _write_split(training_output_file, train_data)
    _write_split(validation_output_file, validation_data)
    _write_split(test_output_file, test_data)
def get_args():
    """Build the CLI parser and return the parsed arguments."""
    arg_parser = ArgumentParser()
    arg_parser.add_argument(
        "--input", type=str, required=True,
        help="Path to jsonl dataset you want to prepare.",
    )
    return arg_parser.parse_args()
if __name__ == "__main__":
    # Script entry point: split the preprocessed dolly jsonl file found in
    # the --input directory into training/validation/test files.
    args = get_args()
    path_to_data = args.input
    main(path_to_data)
\ No newline at end of file
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Dolly data preprocessing.
Example usage:
python preprocess.py --input=<path/to/data/file>
"""
import json
import os
from argparse import ArgumentParser

import numpy as np
def to_jsonl(path_to_data):
    """Convert raw dolly records into NeMo's input/output/category jsonl format.

    Each source record carries ``instruction``, ``context``, ``response`` and
    ``category`` fields.  When a non-empty context is present, the context and
    the (stripped) instruction are joined with a blank line in a random order
    to form the model input; otherwise the raw instruction alone is used.
    The result is written next to the source file as ``<name>-output.jsonl``.

    Args:
        path_to_data: Path to the raw databricks-dolly-15k jsonl file.
    """
    print(f"Preprocessing data to jsonl format...")
    # os.path.splitext keeps directory components intact; the previous
    # path_to_data.split('.')[0] broke for paths with dots elsewhere
    # (e.g. "./data.jsonl" produced the output path "-output.jsonl").
    output_path = f"{os.path.splitext(path_to_data)[0]}-output.jsonl"
    with open(path_to_data, "r") as f, open(output_path, "w") as g:
        for line in f:
            record = json.loads(line)
            context = record["context"].strip()
            if context != "":
                instruction = record["instruction"].strip()
                assert instruction != ""
                # Randomize context and instruction order (both branches of
                # the original differed only in this ordering).
                if np.random.randint(0, 2) == 0:
                    model_input = f"{context}\n\n{instruction}"
                else:
                    model_input = f"{instruction}\n\n{context}"
            else:
                # No context: keep the raw, unstripped instruction, matching
                # the original behavior.  ("model_input" replaces a local that
                # shadowed the builtin `input`.)
                model_input = record["instruction"]
            g.write(
                json.dumps(
                    {"input": model_input, "output": record["response"], "category": record["category"]}
                )
                + "\n"
            )
    print(f"Data was successfully preprocessed and saved by {output_path} .")
def get_args():
    """Define and parse this script's command-line interface."""
    cli = ArgumentParser()
    cli.add_argument(
        "--input", type=str, required=True,
        help="Path to jsonl dataset you want to prepare.",
    )
    parsed = cli.parse_args()
    return parsed
def main():
    """CLI entry point: preprocess the jsonl file given via --input."""
    to_jsonl(get_args().input)
if __name__ == "__main__":
    # Run the preprocessing when executed as a script.
    main()
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment