"tests/pipelines/latte/test_latte.py" did not exist on "673eb60f1c4d971e1a577bed767053e50578b461"
Commit 71e79847 authored by chenzk's avatar chenzk
Browse files

v1.0.3

parents
Pipeline #2034 canceled with stages
"""
To process HuggingFace Datasets:
python3 tools/preprocess_data.py --tokenizer-name-or-path meta-llama/Meta-Llama-3-8B --output-folder datasets/emotion --n-tasks 16 hf --dataset dair-ai/emotion
To process Jsonl files:
python3 tools/preprocess_data.py --tokenizer-name-or-path meta-llama/Meta-Llama-3-8B --output-folder datasets/c4-es --n-tasks 16 jsonl --dataset raw_datasets/c4-es-json-files
"""
"""
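Workaround for "Exception: Is a directory (os error 21)", raised when the tokenizer path is a directory:
edit /usr/local/lib/python3.10/site-packages/datatrove/utils/tokenization.py (line 19)
and replace the commented-out line below with the line that follows it: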
# return Tokenizer.from_file(name_or_path)
return Tokenizer.from_file(name_or_path + "/tokenizer.json")
"""
import argparse
from datatrove.executor.local import LocalPipelineExecutor
from datatrove.pipeline.readers import HuggingFaceDatasetReader, JsonlReader
from datatrove.pipeline.tokens import DocumentTokenizer
def get_args(argv=None) -> argparse.Namespace:
    """Parse the command-line options of the preprocessing script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) ``argparse`` falls back to ``sys.argv[1:]``, so
            command-line use is unchanged; passing a list makes the parser
            usable from other code and from tests.

    Returns:
        The parsed namespace. It always carries ``tokenizer_name_or_path``,
        ``eos_token``, ``output_folder``, ``logging_dir``, ``n_tasks``,
        ``readers`` (``"hf"`` or ``"jsonl"``), ``dataset`` and ``column``.
        The ``hf`` sub-command adds ``split``; the ``jsonl`` sub-command adds
        ``glob_pattern``.

    Raises:
        SystemExit: Raised by ``argparse`` when a required option or the
            sub-command is missing or invalid.
    """
    parser = argparse.ArgumentParser()

    group = parser.add_argument_group(title="Tokenizer")
    group.add_argument(
        "--tokenizer-name-or-path",
        type=str,
        required=True,
        help="A path to a directory containing vocabulary files required by the tokenizer or the model id of a predefined tokenizer hosted inside a model repo on the Hugging Face Hub.",
    )
    group.add_argument(
        "--eos-token",
        type=str,
        default=None,
        help="EOS token to add after each document. Default: None",
    )

    group = parser.add_argument_group(title="Output data")
    group.add_argument(
        "--output-folder", type=str, required=True, help="Path to the output folder to store the tokenized documents"
    )

    group = parser.add_argument_group(title="Miscellaneous configs")
    group.add_argument(
        "--logging-dir",
        type=str,
        default=None,
        help="Path to a folder for storing the logs of the preprocessing step. Default: None",
    )
    group.add_argument(
        "--n-tasks", type=int, default=8, help="Total number of tasks to run the preprocessing step. Default: 8"
    )

    # Subparsers for processing either Hugging Face datasets or jsonl files.
    # The chosen sub-command name is stored in ``args.readers``.
    sp = parser.add_subparsers(
        dest="readers",
        required=True,
        description="Type of dataset to process. It can be either a Hugging Face Dataset loaded with datasets.load_dataset ('hf') or a .jsonl dataset ('jsonl')",
    )

    p1 = sp.add_parser(name="hf")
    p1.add_argument(
        "--dataset",
        type=str,
        required=True,
        help="Path to local stored dataset or repository on the Hugging Face hub that can be loaded with datasets.load_dataset",
    )
    p1.add_argument("--column", type=str, default="text", help="Column to preprocess from the Dataset. Default: text")
    p1.add_argument("--split", type=str, default="train", help="Which split of the data to process. Default: train")

    p2 = sp.add_parser(name="jsonl")
    p2.add_argument(
        "--dataset",
        type=str,
        required=True,
        help="Path to a .jsonl file or a folder containing multiple .jsonl files",
    )
    p2.add_argument("--column", type=str, default="text", help="Column to preprocess from the Dataset. Default: text")
    p2.add_argument(
        "--glob-pattern", type=str, default=None, help="A glob pattern to filter files to read. Default: None"
    )

    # ``parse_args(None)`` reads ``sys.argv[1:]``, matching the old behaviour.
    return parser.parse_args(argv)
def main(args):
    """Run the tokenization pipeline described by the parsed arguments.

    A reader is selected from ``args.readers`` (a Hugging Face dataset reader
    for ``"hf"``, a jsonl reader otherwise) and chained with a
    ``DocumentTokenizer`` inside a ``LocalPipelineExecutor`` that is run with
    ``args.n_tasks`` tasks.

    Args:
        args: Namespace produced by ``get_args``.
    """
    # Pick the datatrove reader matching the requested sub-command.
    if args.readers != "hf":
        source_step = JsonlReader(
            data_folder=args.dataset,
            text_key=args.column,
            glob_pattern=args.glob_pattern,
        )
    else:
        source_step = HuggingFaceDatasetReader(
            dataset=args.dataset,
            text_key=args.column,
            dataset_options={"split": args.split},
        )

    # Tokenizer stage: shuffling disabled, output files capped at 1e9 tokens.
    tokenize_step = DocumentTokenizer(
        output_folder=args.output_folder,
        tokenizer_name_or_path=args.tokenizer_name_or_path,
        eos_token=args.eos_token,
        shuffle=False,
        max_tokens_per_file=1e9,
    )

    executor = LocalPipelineExecutor(
        pipeline=[source_step, tokenize_step],
        tasks=args.n_tasks,
        logging_dir=args.logging_dir,
    )
    executor.run()
if __name__ == "__main__":
    # Script entry point: parse the command line, then run the pipeline.
    main(get_args())
# Processes per node and the matching parallelism layout (nproc_per_node = dp * pp * tp):
# --nproc_per_node=8: dp=2, pp=2, tp=2
# --nproc_per_node=4: dp=1, pp=2, tp=2
# --nproc_per_node=1: dp=1, pp=1, tp=1
CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=4 run_train.py --config-file examples/config_tiny_llama.yaml
# CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=4 run_train.py --config-file examples/config_tiny_llama_cosmo2tokenizer.yaml
# CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=4 run_train.py --config-file examples/config_llama3_dummytokenizer.yaml
# CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=4 run_train.py --config-file examples/config_llama3.yaml
# CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=1 run_train.py --config-file smollm1/config_smollm1_135M_demo1.yaml
CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=1 run_train.py --config-file smollm1/config_smollm1_135M_demo2.yaml
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment