Commit 541376a8 authored by wxj's avatar wxj
Browse files

添加dolly-15k数据集处理脚本

parent 0651a856
......@@ -35,11 +35,11 @@ VALID_NAMES="[databricks-dolly-15k]"
# CONCAT_SAMPLING_PROBS="[0.3,0.7]" # "[1]" # 只有一个数据集设置为1
CONCAT_SAMPLING_PROBS="[1]"
# 可能需要导入环境变量
# 可能需要导入环境变量
export LD_PRELOAD=/usr/local/lib/python3.10/site-packages/transformer_engine.libs/libgalaxyhip-8e217ef3.so.5.2.24472.1059-0a6afed7
# 运行训练脚本
torchrun --nproc_per_node 8 \
/workspace/nemo_main/NeMo-2.0.0.rc0.beta/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
./NeMo-2.0.0.rc0.beta/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
trainer.precision=bf16 \
trainer.devices=8 \
trainer.num_nodes=1 \
......
import json
import random
from argparse import ArgumentParser
from pathlib import Path
def main(path_to_data, train_proportion=0.80, validation_proportion=0.15):
    """Split the preprocessed dolly jsonl file into train/validation/test sets.

    Reads ``databricks-dolly-15k-output.jsonl`` from *path_to_data*, shuffles
    its lines, and writes ``training.jsonl``, ``validation.jsonl`` and
    ``test.jsonl`` into the same directory.  The test split receives whatever
    lines remain after the train and validation splits are taken, so no
    record is ever dropped.

    Args:
        path_to_data: Directory containing the preprocessed jsonl file.
        train_proportion: Fraction of lines used for training (default 0.80).
        validation_proportion: Fraction of lines used for validation
            (default 0.15; the remainder — 0.05 by default — becomes test).
    """
    root = Path(path_to_data)
    input_file = root / "databricks-dolly-15k-output.jsonl"
    training_output_file = root / "training.jsonl"
    validation_output_file = root / "validation.jsonl"
    test_output_file = root / "test.jsonl"
    # Read the JSONL file and shuffle the JSON objects.
    # NOTE(review): shuffle is unseeded, so splits differ between runs.
    with open(input_file, "r") as f:
        lines = f.readlines()
    random.shuffle(lines)
    # Calculate split indices.
    total_lines = len(lines)
    train_index = int(total_lines * train_proportion)
    val_index = int(total_lines * validation_proportion)
    # Distribute JSON objects into training, validation and test sets;
    # the test split takes the remainder.
    train_data = lines[:train_index]
    validation_data = lines[train_index:train_index + val_index]
    test_data = lines[train_index + val_index:]

    def _write_split(output_file, data):
        # Normalize each record to exactly one trailing newline.
        with open(output_file, "w") as f:
            for line in data:
                f.write(line.strip() + "\n")

    # One shared helper keeps the three writes consistent (the original
    # duplicated this loop three times, with a copy-pasted comment that
    # mislabeled the test file as a training file).
    _write_split(training_output_file, train_data)
    _write_split(validation_output_file, validation_data)
    _write_split(test_output_file, test_data)
def get_args():
    """Build the CLI parser and return the parsed arguments."""
    arg_parser = ArgumentParser()
    arg_parser.add_argument(
        "--input", type=str, required=True,
        help="Path to jsonl dataset you want to prepare.",
    )
    return arg_parser.parse_args()
if __name__ == "__main__":
    # Script entry point: split the preprocessed dolly jsonl file found in
    # the --input directory into training/validation/test files.
    args = get_args()
    path_to_data = args.input
    main(path_to_data)
\ No newline at end of file
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Dolly data preprocessing.
Example usage:
python preprocess.py --input=<path/to/data/file>
"""
import json
import os
from argparse import ArgumentParser

import numpy as np
def to_jsonl(path_to_data):
    """Convert raw dolly records into NeMo's input/output/category jsonl format.

    Each source record carries ``instruction``, ``context``, ``response`` and
    ``category`` fields.  When a non-empty context is present, the context and
    the (stripped) instruction are joined with a blank line in a random order
    to form the model input; otherwise the raw instruction alone is used.
    The result is written next to the source file as ``<name>-output.jsonl``.

    Args:
        path_to_data: Path to the raw databricks-dolly-15k jsonl file.
    """
    print(f"Preprocessing data to jsonl format...")
    # os.path.splitext keeps directory components intact; the previous
    # path_to_data.split('.')[0] broke for paths with dots elsewhere
    # (e.g. "./data.jsonl" produced the output path "-output.jsonl").
    output_path = f"{os.path.splitext(path_to_data)[0]}-output.jsonl"
    with open(path_to_data, "r") as f, open(output_path, "w") as g:
        for line in f:
            record = json.loads(line)
            context = record["context"].strip()
            if context != "":
                instruction = record["instruction"].strip()
                assert instruction != ""
                # Randomize context and instruction order (both branches of
                # the original differed only in this ordering).
                if np.random.randint(0, 2) == 0:
                    model_input = f"{context}\n\n{instruction}"
                else:
                    model_input = f"{instruction}\n\n{context}"
            else:
                # No context: keep the raw, unstripped instruction, matching
                # the original behavior.  ("model_input" replaces a local that
                # shadowed the builtin `input`.)
                model_input = record["instruction"]
            g.write(
                json.dumps(
                    {"input": model_input, "output": record["response"], "category": record["category"]}
                )
                + "\n"
            )
    print(f"Data was successfully preprocessed and saved by {output_path} .")
def get_args():
    """Define and parse this script's command-line interface."""
    cli = ArgumentParser()
    cli.add_argument(
        "--input", type=str, required=True,
        help="Path to jsonl dataset you want to prepare.",
    )
    parsed = cli.parse_args()
    return parsed
def main():
    """CLI entry point: preprocess the jsonl file given via --input."""
    to_jsonl(get_args().input)
if __name__ == "__main__":
    # Run the preprocessing when executed as a script.
    main()
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment