Commit b8bb0b49 authored by Mohammad

Debugging done on Circe

parent e3c57c82
@@ -391,6 +391,7 @@ def _add_data_args(parser):
     group.add_argument('--faiss-use-gpu', action='store_true')
     group.add_argument('--index-reload-interval', type=int, default=500)
     group.add_argument('--use-regular-masking', action='store_true')
+    group.add_argument('--use-random-spans', action='store_true')
     group.add_argument('--allow-trivial-doc', action='store_true')
     group.add_argument('--ner-data-path', type=str, default=None)
@@ -28,6 +28,9 @@ def build_realm_training_sample(sample, max_seq_length,
             cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng)
     elif block_ner_mask is not None:
         block_ner_mask = list(itertools.chain(*block_ner_mask))[:max_seq_length - 2]
+        if args.use_random_spans:
+            rand_idx = np.random.randint(len(block_ner_mask))
+            block_ner_mask = block_ner_mask[rand_idx:] + block_ner_mask[:rand_idx]
         block_ner_mask = [0] + block_ner_mask + [0]
         masked_tokens, masked_positions, masked_labels = get_arrays_using_ner_mask(tokens, block_ner_mask, mask_id)
     else:
@@ -182,7 +185,7 @@ def get_block_samples_mapping(block_dataset, title_dataset, data_prefix, num_epo
     indexmap_filename += '.npy'

     # Build the indexed mapping if not exist.
-    if torch.distributed.get_rank() == 0 and \
+    if mpu.get_data_parallel_rank() == 0 and \
            not os.path.isfile(indexmap_filename):
        print(' > WARNING: could not find index map file {}, building '
              'the indices on rank 0 ...'.format(indexmap_filename))
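The new --use-random-spans flag feeds the branch added above: when it is set, the flattened per-token span mask is rotated by a random offset before the [CLS]/[SEP] padding is attached, so the positions chosen for masking vary between visits to the same block. A minimal standalone sketch of that rotation (rotate_span_mask is a hypothetical helper name; the commit inlines this logic in build_realm_training_sample):

import numpy as np

def rotate_span_mask(block_ner_mask, rng=np.random):
    # Draw a random pivot and rotate the flat span mask around it, exactly as
    # the diff does with block_ner_mask[rand_idx:] + block_ner_mask[:rand_idx].
    rand_idx = rng.randint(len(block_ner_mask))
    return block_ner_mask[rand_idx:] + block_ner_mask[:rand_idx]

# Example: one three-token span marked with 1s.
mask = [0, 0, 1, 1, 1, 0, 0, 0]
rotated = rotate_span_mask(mask)
# For rand_idx == 2 this yields [1, 1, 1, 0, 0, 0, 0, 0]; note the rotation
# can also split a span across the wrap-around point.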
@@ -15,12 +15,16 @@ def detach(tensor):

 class BlockData(object):
-    def __init__(self):
-        args = get_args()
+    def __init__(self, block_data_path=None):
         self.embed_data = dict()
         self.meta_data = dict()
-        block_data_path = os.path.splitext(args.block_data_path)[0]
-        self.temp_dir_name = block_data_path + '_tmp'
+        if block_data_path is None:
+            args = get_args()
+            block_data_path = args.block_data_path
+        self.block_data_path = block_data_path
+        block_data_name = os.path.splitext(self.block_data_path)[0]
+        self.temp_dir_name = block_data_name + '_tmp'

     def state(self):
         return {
@@ -54,7 +58,7 @@ class BlockData(object):
     def save_shard(self, rank):
         if not os.path.isdir(self.temp_dir_name):
-            os.mkdir(self.temp_dir_name)
+            os.makedirs(self.temp_dir_name, exist_ok=True)

         # save the data for each shard
         with open('{}/{}.pkl'.format(self.temp_dir_name, rank), 'wb') as data_file:
@@ -73,8 +77,7 @@ class BlockData(object):
                 self.meta_data.update(data['meta_data'])
                 # assert (len(self.embed_data) == old_size + shard_size) or (str(ignore_shard) in fname)

-        args = get_args()
-        with open(args.block_data_path, 'wb') as final_file:
+        with open(self.block_data_path, 'wb') as final_file:
             pickle.dump(self.state(), final_file)

         shutil.rmtree(self.temp_dir_name, ignore_errors=True)
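The BlockData changes make the output path an explicit constructor argument rather than something read from get_args() at each call site, so the class can be built outside a full Megatron argument context. A small usage sketch under that assumption ('block_data.pkl' is a hypothetical path; the consolidation shown in the last hunk is what finally writes it):

# Point BlockData at an explicit pickle path; if no path is given it still
# falls back to args.block_data_path via get_args(), as before.
block_data = BlockData(block_data_path='block_data.pkl')

# Each rank dumps its shard into <path without extension>_tmp/,
# e.g. block_data_tmp/0.pkl, now created with os.makedirs(..., exist_ok=True).
block_data.save_shard(rank=0)

# The merge step in the last hunk then writes the combined state to
# self.block_data_path and removes the temporary shard directory.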
@@ -422,8 +422,9 @@ def train(forward_step_func, model, optimizer, lr_scheduler,
         elif iteration < 20:
-            print("moving right along", flush=True)
-            report_memory("iteration {}".format(iteration))
+            #print("moving right along", flush=True)
+            #report_memory("iteration {}".format(iteration))
+            pass

         loss_dict, skipped_iter = train_step(forward_step_func,
                                              train_data_iterator,
                                              model,