pretrain_blend.yaml 849 Bytes
Newer Older
xingjinliang's avatar
xingjinliang committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# Data blend for pretraining, described as a Megatron Energon Metadataset.
#
# Every `path` below is a placeholder: replace it with the location of a
# dataset prepared in the Megatron Energon format. See
# https://nvidia.github.io/Megatron-Energon/data_prep.html for how to prepare
# datasets in that format.
__module__: megatron.energon
__class__: Metadataset
splits:
  train:
    datasets:
      # Datasets are weighted according to their size, and the weights are
      # meant to sum to 1 across the full list of pretrain datasets. Only an
      # excerpt is shown here (the three weights below add up to 0.609); see
      # Table 4 in https://arxiv.org/pdf/2409.11402 for the full list.
      - weight: 0.579
        path: <path to laion dataset>
        subflavors:
          augmentation: false

      - weight: 0.02
        path: <path to coco>
        subflavors:
          augmentation: false

      - weight: 0.01
        path: <path to vqav2 dataset>
        subflavors:
          augmentation: false

  val:
    datasets:
      # A single validation dataset, so it carries the whole weight.
      - weight: 1.0
        path: <path to validation dataset>
        subflavors:
          augmentation: false