v1.0

e4575be9 · huaerkl · e4575be9 · e4575be9 · e4575be9 · e4575be9
Commit e4575be9 authored Aug 04, 2023 by huaerkl
20 changed files
--- a/megatron/data/biencoder_dataset_utils.py
+++ b/megatron/data/biencoder_dataset_utils.py
--- a/megatron/data/blendable_dataset.py
+++ b/megatron/data/blendable_dataset.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Blendable dataset."""
+import time
+import numpy as np
+import torch
+from megatron import print_rank_0
+from megatron import mpu
+class BlendableDataset(torch.utils.data.Dataset):
+    """Dataset that serves samples drawn from several underlying datasets.
+
+    Two parallel index arrays, filled in by
+    ``helpers.build_blending_indices``, record for every blended position
+    which dataset to read from and which sample of that dataset to return.
+    """
+
+    def __init__(self, datasets, weights):
+        """Store the datasets and build the blending indices.
+
+        Args:
+            datasets: sequence of datasets to blend (fewer than 255).
+            weights: one weight per dataset; they are normalized here, so
+                their sum must be positive.
+        """
+        self.datasets = datasets
+        dataset_count = len(datasets)
+        assert dataset_count == len(weights)
+        # The blended length is the combined length of all datasets.
+        self.size = sum(len(member) for member in self.datasets)
+        # Normalize weights.
+        normalized = np.array(weights, dtype=np.float64)
+        weight_total = np.sum(normalized)
+        assert weight_total > 0.0
+        normalized /= weight_total
+        # Build indices.
+        build_start = time.time()
+        # Dataset ids are stored as uint8 below.
+        assert dataset_count < 255
+        self.dataset_index = np.zeros(self.size, dtype=np.uint8)
+        self.dataset_sample_index = np.zeros(self.size, dtype=np.int64)
+        from megatron.data import helpers
+        helpers.build_blending_indices(
+            self.dataset_index,
+            self.dataset_sample_index,
+            normalized,
+            dataset_count,
+            self.size,
+            torch.distributed.get_rank() == 0,
+        )
+        elapsed = time.time() - build_start
+        print_rank_0(
+            '> elapsed time for building blendable dataset indices: '
+            '{:.2f} (sec)'.format(elapsed))
+
+    def __len__(self):
+        """Return the total number of blended samples."""
+        return self.size
+
+    def __getitem__(self, idx):
+        """Return the sample that blended position ``idx`` maps to."""
+        which, where = self.dataset_index[idx], self.dataset_sample_index[idx]
+        return self.datasets[which][where]
--- a/megatron/data/data_samplers.py
+++ b/megatron/data/data_samplers.py
--- a/megatron/data/dataset_utils.py
+++ b/megatron/data/dataset_utils.py
--- a/megatron/data/decoder_packed_mtf_dataset.py
+++ b/megatron/data/decoder_packed_mtf_dataset.py
--- a/megatron/data/distdata.py
+++ b/megatron/data/distdata.py
--- a/megatron/data/gpt_dataset.py
+++ b/megatron/data/gpt_dataset.py
--- a/megatron/data/helpers.cpp
+++ b/megatron/data/helpers.cpp
--- a/megatron/data/ict_dataset.py
+++ b/megatron/data/ict_dataset.py
--- a/megatron/data/indexed_dataset.py
+++ b/megatron/data/indexed_dataset.py
--- a/megatron/data/mlm_dataset.py
+++ b/megatron/data/mlm_dataset.py
--- a/megatron/data/mtf_dataset.py
+++ b/megatron/data/mtf_dataset.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Multitask Finetune style dataset."""
+import time
+import numpy as np
+import torch
+from megatron import print_rank_0
+from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset
+class MTFDataset(torch.utils.data.Dataset):
+    """Dataset pairing input tokens with target tokens.
+
+    Inputs and targets are read from two separate indexed datasets that
+    must contain the same number of documents.
+    """
+
+    def __init__(self, name, data_prefix, data_impl, skip_warmup, documents):
+        """Open the input/target indexed datasets and validate ``documents``.
+
+        Args:
+            name: label stored on the instance.
+            data_prefix: path prefix shared by both indexed datasets.
+            data_impl: forwarded to the indexed-dataset builder.
+            skip_warmup: forwarded to the indexed-dataset builder.
+            documents: document indices; only range-checked here.
+        """
+        # Params to store.
+        self.name = name
+        # Dataset.
+        self.input_indexed_dataset = get_indexed_dataset(
+            data_prefix,
+            is_input=True,
+            data_impl=data_impl,
+            skip_warmup=skip_warmup,
+        )
+        self.target_indexed_dataset = get_indexed_dataset(
+            data_prefix,
+            is_input=False,
+            data_impl=data_impl,
+            skip_warmup=skip_warmup,
+        )
+        # Checks: every document index must exist in both datasets, and the
+        # two datasets must hold the same number of documents.
+        assert np.min(documents) >= 0
+        highest_document = np.max(documents)
+        assert highest_document < self.input_indexed_dataset.sizes.shape[0]
+        assert highest_document < self.target_indexed_dataset.sizes.shape[0]
+        assert (self.input_indexed_dataset.sizes.shape[0]
+                == self.target_indexed_dataset.sizes.shape[0])
+
+    def __len__(self):
+        """Return the length of the input indexed dataset."""
+        return len(self.input_indexed_dataset)
+
+    def __getitem__(self, idx):
+        """Return the non-empty input and target tokens stored at ``idx``."""
+        sample = {
+            'input_tokens': self.input_indexed_dataset.get(idx),
+            'target_tokens': self.target_indexed_dataset.get(idx),
+        }
+        assert len(sample['input_tokens']) > 0
+        assert len(sample['target_tokens']) > 0
+        return sample
+
+    def size(self, index):
+        """Return the input and target sizes reported for ``index``."""
+        input_size = self.input_indexed_dataset.size(index)
+        target_size = self.target_indexed_dataset.size(index)
+        return {'input_tokens': input_size, 'target_tokens': target_size}
+def get_indexed_dataset(data_prefix: str, is_input: bool, data_impl: str, skip_warmup: bool):
+    """Build the "inputs" or "targets" indexed dataset under ``data_prefix``.
+
+    The dataset path is ``{data_prefix}_inputs_document`` when ``is_input``
+    is true and ``{data_prefix}_targets_document`` otherwise.
+    """
+    suffix = "inputs" if is_input else "targets"
+    dataset_path = f"{data_prefix}_{suffix}_document"
+    return get_indexed_dataset_(dataset_path, data_impl, skip_warmup)
+def get_indexed_dataset_(path, data_impl, skip_warmup):
+    """Build the indexed dataset at ``path`` and log how long it took.
+
+    Args:
+        path: dataset path handed to ``make_indexed_dataset``.
+        data_impl: dataset implementation name, forwarded unchanged.
+        skip_warmup: forwarded unchanged.
+
+    Returns:
+        The object returned by ``make_indexed_dataset``.
+    """
+    print_rank_0(' > building dataset index ...')
+    start_time = time.time()
+    indexed_dataset = make_indexed_dataset(path,
+                                           data_impl,
+                                           skip_warmup)
+    # '{:.4f}' prints four decimals; the previous '{:4f}' only set a minimum
+    # field width of 4 and still printed the default six decimals.
+    print_rank_0(' > finished creating indexed dataset in {:.4f} '
+                 'seconds'.format(time.time() - start_time))
+    print_rank_0('    number of documents: {}'.format(
+        indexed_dataset.sizes.shape[0]))
+    return indexed_dataset
--- a/megatron/data/orqa_wiki_dataset.py
+++ b/megatron/data/orqa_wiki_dataset.py
--- a/megatron/data/realm_dataset_utils.py
+++ b/megatron/data/realm_dataset_utils.py
--- a/megatron/data/realm_index.py
+++ b/megatron/data/realm_index.py
--- a/megatron/data/t5_dataset.py
+++ b/megatron/data/t5_dataset.py
--- a/megatron/data/test/test_indexed_dataset.py
+++ b/megatron/data/test/test_indexed_dataset.py
--- a/megatron/data/test/test_preprocess_data.sh
+++ b/megatron/data/test/test_preprocess_data.sh
+#!/bin/bash
+# Run ../preprocess_data.py on test_samples.json with a single dataset
+# implementation; the implementation name is also appended to the output
+# prefix.
+DATASET_IMPL=cached
+python ../preprocess_data.py \
+       --input test_samples.json \
+       --vocab vocab.txt \
+       --dataset-impl "${DATASET_IMPL}" \
+       --output-prefix "test_samples_${DATASET_IMPL}" \
+       --workers 1 \
+       --log-interval 2
--- a/megatron/data/vit_dataset.py
+++ b/megatron/data/vit_dataset.py
--- a/megatron/enums.py
+++ b/megatron/enums.py