c4.yaml 678 Bytes
Newer Older
Yufeng Xu's avatar
Yufeng Xu committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# Task definition for `c4`: rolling log-likelihood evaluation on the `en`
# configuration of the `allenai/c4` dataset.
task: c4
dataset_path: allenai/c4
dataset_name: en
output_type: loglikelihood_rolling
training_split: train
validation_split: validation
# No prompt text is supplied; the target string is produced by
# preprocess_c4.c4_detokenizer.
doc_to_text: ""
doc_to_target: !function preprocess_c4.c4_detokenizer
process_results: !function preprocess_c4.process_results
should_decontaminate: true
# NOTE(review): the template reads a `page` field from each document. Confirm
# that the C4 records loaded below actually expose `page`; the public dataset
# card lists `text`, `timestamp` and `url` only.
doc_to_decontamination_query: "{{page}}"
metric_list:
  - metric: word_perplexity
  - metric: byte_perplexity
  - metric: bits_per_byte
metadata:
  version: 0.0
dataset_kwargs:
  # Only the first shard of each split is listed, so only those two files are
  # referenced rather than the full 1024-shard / 8-shard splits.
  data_files:
    train: en/c4-train.00000-of-01024.json.gz
    validation: en/c4-validation.00000-of-00008.json.gz
  # following the choice of https://arxiv.org/abs/2410.07461
  # Presumably forwarded to the dataset loader to skip checksum/size
  # verification, since only a subset of files is requested -- TODO confirm.
  verification_mode: "no_checks"