# Task configuration: rolling log-likelihood (perplexity-style) evaluation on
# the English C4 corpus.
task: c4
dataset_path: allenai/c4
dataset_name: en
output_type: loglikelihood_rolling
training_split: train
validation_split: validation

# The prompt is empty; the whole (detokenized) document is the scored target.
doc_to_text: ""
doc_to_target: !function preprocess_c4.c4_detokenizer
process_results: !function preprocess_c4.process_results

should_decontaminate: true
# NOTE(review): confirm that allenai/c4 documents actually expose a `page`
# field; if they do not, this template has nothing to render.
doc_to_decontamination_query: "{{page}}"

metric_list:
  - metric: word_perplexity
  - metric: byte_perplexity
  - metric: bits_per_byte

metadata:
  version: 0.0

# Extra dataset-loading arguments. A single file is listed per split.
dataset_kwargs:
  data_files:
    train: en/c4-train.00000-of-01024.json.gz
    validation: en/c4-validation.00000-of-00008.json.gz
  # following the choice of https://arxiv.org/abs/2410.07461
  trust_remote_code: true
  verification_mode: "no_checks"