Unverified Commit 69c9b89f authored by Phuc Van Phan, committed by GitHub

docs: add docs for map, and add num procs to load_dataset (#27520)

parent 85fde09c
@@ -439,6 +439,7 @@ def main():
         data_args.dataset_config_name,
         split=data_args.train_split_name,
         cache_dir=data_args.dataset_cache_dir,
+        num_proc=data_args.preprocessing_num_workers,
         token=True if model_args.use_auth_token else None,
     )
@@ -448,6 +449,7 @@ def main():
         data_args.dataset_config_name,
         split=data_args.eval_split_name,
         cache_dir=data_args.dataset_cache_dir,
+        num_proc=data_args.preprocessing_num_workers,
         token=True if model_args.use_auth_token else None,
     )
@@ -551,7 +553,7 @@ def main():
         prepare_dataset,
         remove_columns=next(iter(raw_datasets.values())).column_names,
         num_proc=num_workers,
-        desc="preprocess train dataset",
+        desc="preprocess train and eval dataset",
     )
     # filter training data with inputs longer than max_input_length
...
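For context, a minimal sketch of what this change does, assuming the Hugging Face `datasets` API: `num_proc` is passed to both `load_dataset` (parallel download/preparation) and `.map` (parallel preprocessing). The dataset id, config, split, and worker count below are illustrative placeholders, not taken from the script.

```python
from datasets import load_dataset

num_workers = 4  # stands in for data_args.preprocessing_num_workers

# Parallelize dataset download and preparation across workers.
raw_dataset = load_dataset(
    "mozilla-foundation/common_voice_11_0",  # hypothetical dataset id
    "en",                                    # hypothetical config name
    split="train",
    num_proc=num_workers,
)

def prepare_dataset(batch):
    # Placeholder preprocessing; the real script extracts audio features here.
    batch["input_length"] = len(batch["audio"]["array"])
    return batch

# Parallelize preprocessing with the same worker count.
vectorized_dataset = raw_dataset.map(
    prepare_dataset,
    remove_columns=raw_dataset.column_names,
    num_proc=num_workers,
    desc="preprocess train and eval dataset",
)
```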