OpenDAS / Megatron-LM

Commit 242770dd, authored Dec 29, 2020 by mshoeybi

    moved compile helper to initialize

parent a495871b
Showing 5 changed files with 10 additions and 15 deletions (+10 -15)
megatron/data/bert_dataset.py         +0  -2
megatron/data/blendable_dataset.py    +0  -7
megatron/data/gpt2_dataset.py         +0  -2
megatron/data/realm_dataset_utils.py  +0  -4
megatron/initialize.py                +10 -0
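Every hunk below touches compile_helper, which builds the C++ helpers extension for megatron.data at runtime. Its implementation is not part of this diff; the following is only an assumed sketch of what such a runtime compile step looks like in this codebase: shell out to make in the megatron/data directory on a single process and abort if the build fails.

# Assumed sketch only; the real compile_helper lives in
# megatron/data/dataset_utils.py and is not shown in this commit.
import os
import subprocess
import sys

def compile_helper():
    """Build the dataset C++ helpers at runtime (call from one process only)."""
    path = os.path.abspath(os.path.dirname(__file__))
    ret = subprocess.run(['make', '-C', path])  # build the helpers extension in-place
    if ret.returncode != 0:
        print('Making C++ dataset helpers module failed, exiting.')
        sys.exit(1)

Because compiling from several ranks at once would race on the build artifacts, the call sites below either guard the call with a rank-0 check or rely on the caller to do so; this commit consolidates that guard in one place.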
megatron/data/bert_dataset.py
@@ -128,8 +128,6 @@ def get_samples_mapping_(indexed_dataset,
         print_rank_0(' > building sapmles index mapping for {} ...'.format(
             name))
         # First compile and then import.
-        from megatron.data.dataset_utils import compile_helper
-        compile_helper()
         from megatron.data import helpers
         samples_mapping = helpers.build_mapping(
             indexed_dataset.doc_idx,
megatron/data/blendable_dataset.py
@@ -49,13 +49,6 @@ class BlendableDataset(torch.utils.data.Dataset):
         self.dataset_index = np.zeros(self.size, dtype=np.uint8)
         self.dataset_sample_index = np.zeros(self.size, dtype=np.int64)
-        if torch.distributed.get_rank() == 0:
-            from megatron.data.dataset_utils import compile_helper
-            compile_helper()
-        # Simple barrier
-        tmp = torch.cuda.LongTensor([1])
-        torch.distributed.all_reduce(tmp,
-                                     group=mpu.get_data_parallel_group())
         from megatron.data import helpers
         helpers.build_blending_indices(self.dataset_index,
                                        self.dataset_sample_index,
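A side effect of the move: the removed block above synchronized ranks with an all_reduce on a one-element CUDA tensor scoped to the data-parallel group, whereas the replacement in megatron/initialize.py (last hunk below) uses an explicit torch.distributed.barrier() on the default group. Both are standard PyTorch distributed calls; the snippet below only contrasts the two synchronization styles and assumes a process group (and CUDA, for the first variant) is already initialized.

import torch
import torch.distributed as dist

def barrier_via_allreduce(group=None):
    # Old style (removed here): reduce a dummy tensor so every rank in
    # the group waits for the others before continuing.
    tmp = torch.cuda.LongTensor([1])
    dist.all_reduce(tmp, group=group)

def barrier_direct():
    # New style (added in megatron/initialize.py): an explicit barrier
    # across the default process group.
    dist.barrier()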
megatron/data/gpt2_dataset.py
@@ -269,8 +269,6 @@ def _build_index_mappings(name, data_prefix, documents, sizes,
         start_time = time.time()
         # Use C++ implementation for speed.
         # First compile and then import.
-        from megatron.data.dataset_utils import compile_helper
-        compile_helper()
         from megatron.data import helpers
         assert doc_idx.dtype == np.int32
         assert sizes.dtype == np.int32
megatron/data/realm_dataset_utils.py
@@ -152,10 +152,6 @@ def get_block_samples_mapping(block_dataset, title_dataset, data_prefix, num_epo
         print_rank_0(' > building samples index mapping for {} ...'.format(
             name))
-        # compile/bind the C++ helper code
-        from megatron.data.dataset_utils import compile_helper
-        compile_helper()
-
         from megatron.data import helpers
         mapping_array = helpers.build_blocks_mapping(
             block_dataset.doc_idx,
megatron/initialize.py
@@ -78,6 +78,16 @@ def initialize_megatron(extra_args_provider=None, args_defaults={},
     # Autoresume.
     _init_autoresume()

+    # Compile dataset C++ code.
+    try:
+        from megatron.data import helpers
+    except:
+        if torch.distributed.get_rank() == 0:
+            from megatron.data.dataset_utils import compile_helper
+            compile_helper()
+        # Simple barrier
+        torch.distributed.barrier()
+
     # No continuation function
     return None
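Net effect: the four dataset files stop compiling the extension themselves, and initialize_megatron() tries to import megatron.data.helpers once at startup; if the import fails, rank 0 compiles it and every rank waits at a barrier before continuing. Below is a minimal standalone sketch of that pattern, assuming torch.distributed is already initialized (the commit itself uses a bare except rather than catching ImportError specifically):

import torch

def ensure_dataset_helpers():
    # Try the already-built extension first; only compile if it is missing.
    try:
        from megatron.data import helpers  # noqa: F401
    except ImportError:
        if torch.distributed.get_rank() == 0:
            from megatron.data.dataset_utils import compile_helper
            compile_helper()
        # Simple barrier: no rank imports the module before rank 0 has built it.
        torch.distributed.barrier()

Downstream code such as gpt2_dataset._build_index_mappings can then import helpers directly, as the hunks above show.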