Commit ebc95c35 authored by Mostofa Patwary

Merge branch 'main_retriver_merge' into main_retriver_merge_ict_eval

parents 360fb6b7 a4b628ab
@@ -27,7 +27,7 @@ python pretrain_ict.py \
        --log-interval 100 \
        --eval-interval 1000 \
        --eval-iters 10 \
-       --report-topk-accuracies 1 5 10 20 100 \
+       --retriever-report-topk-accuracies 1 5 10 20 100 \
        --retriever-score-scaling \
        --load $CHECKPOINT_PATH \
        --save $CHECKPOINT_PATH \
@@ -649,16 +649,12 @@ def _add_biencoder_args(parser):
     group.add_argument('--ict-head-size', type=int, default=None,
                        help='Size of block embeddings to be used in ICT and '
                        'REALM (paper default: 128)')
-    group.add_argument('--projection-dim', type=int, default=0,
+    group.add_argument('--biencoder-projection-dim', type=int, default=0,
                        help='Size of projection head used in biencoder (paper'
                        ' default: 128)')
-    group.add_argument('--shared-query-context-model', action='store_true',
+    group.add_argument('--biencoder-shared-query-context-model', action='store_true',
                        help='Whether to share the parameters of the query '
                        'and context models or not')
-    group.add_argument('--pool-type', type=str, default='cls-token',
-                       choices=['avg', 'cls-token', 'max'],
-                       help='different options are: avg | cls-token | max, '
-                       'default=cls-token')

     # checkpointing
     group.add_argument('--ict-load', type=str, default=None,
@@ -677,7 +673,7 @@ def _add_biencoder_args(parser):
                        help='Whether to use one sentence documents in ICT')

     # training
-    group.add_argument('--report-topk-accuracies', nargs='+', type=int,
+    group.add_argument('--retriever-report-topk-accuracies', nargs='+', type=int,
                        default=[], help="Which top-k accuracies to report "
                        "(e.g. '1 5 20')")
     group.add_argument('--retriever-score-scaling', action='store_true',
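For reference, the renamed options above parse just like the old ones. Below is a minimal standalone argparse sketch, not the Megatron arguments module itself, with a hypothetical command line, showing the attribute names the model code now reads (args.biencoder_projection_dim, args.biencoder_shared_query_context_model, args.retriever_report_topk_accuracies):

# Standalone sketch of the renamed biencoder/retriever flags; the sample
# command-line values below are illustrative only.
import argparse

parser = argparse.ArgumentParser()
group = parser.add_argument_group('biencoder')
group.add_argument('--biencoder-projection-dim', type=int, default=0,
                   help='Size of projection head used in biencoder '
                        '(paper default: 128)')
group.add_argument('--biencoder-shared-query-context-model',
                   action='store_true',
                   help='Whether to share the parameters of the query '
                        'and context models or not')
group.add_argument('--retriever-report-topk-accuracies', nargs='+', type=int,
                   default=[], help="Which top-k accuracies to report "
                                    "(e.g. '1 5 20')")

args = parser.parse_args([
    '--biencoder-projection-dim', '128',
    '--biencoder-shared-query-context-model',
    '--retriever-report-topk-accuracies', '1', '5', '10', '20', '100',
])
# argparse converts dashes to underscores, giving the attribute names used
# throughout the rest of this diff.
print(args.biencoder_projection_dim)              # 128
print(args.biencoder_shared_query_context_model)  # True
print(args.retriever_report_topk_accuracies)      # [1, 5, 10, 20, 100]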
@@ -17,7 +17,7 @@ from .module import MegatronModule

 def biencoder_model_provider(only_query_model=False,
                              only_context_model=False,
-                             shared_query_context_model=False):
+                             biencoder_shared_query_context_model=False):
     """Build the model."""

     args = get_args()
@@ -31,10 +31,11 @@ def biencoder_model_provider(only_query_model=False,
     # the LM we initialize with has 2 tokentypes
     model = BiEncoderModel(
         num_tokentypes=2,
-        parallel_output=True,
+        parallel_output=False,
         only_query_model=only_query_model,
         only_context_model=only_context_model,
-        shared_query_context_model=shared_query_context_model)
+        biencoder_shared_query_context_model=\
+            biencoder_shared_query_context_model)

     return model
@@ -47,7 +48,7 @@ class BiEncoderModel(MegatronModule):
                  parallel_output=True,
                  only_query_model=False,
                  only_context_model=False,
-                 shared_query_context_model=False):
+                 biencoder_shared_query_context_model=False):
         super(BiEncoderModel, self).__init__()
         args = get_args()
@@ -55,13 +56,14 @@ class BiEncoderModel(MegatronModule):
             num_tokentypes=num_tokentypes,
             parallel_output=parallel_output)

-        self.shared_query_context_model = shared_query_context_model
+        self.biencoder_shared_query_context_model = \
+            biencoder_shared_query_context_model
         assert not (only_context_model and only_query_model)
         self.use_context_model = not only_query_model
         self.use_query_model = not only_context_model
-        self.projection_dim = args.projection_dim
+        self.biencoder_projection_dim = args.biencoder_projection_dim

-        if self.shared_query_context_model:
+        if self.biencoder_shared_query_context_model:
             self.model = PretrainedBertModel(**bert_kwargs)
             self._model_key = 'shared_model'
             self.query_model, self.context_model = self.model, self.model
@@ -109,7 +111,7 @@ class BiEncoderModel(MegatronModule):
                                        prefix='', keep_vars=False):
         """Save dict with state dicts of each of the models."""
         state_dict_ = {}
-        if self.shared_query_context_model:
+        if self.biencoder_shared_query_context_model:
             state_dict_[self._model_key] = \
                 self.model.state_dict_for_save_checkpoint(destination,
                                                           prefix,
@@ -129,7 +131,7 @@ class BiEncoderModel(MegatronModule):
     def load_state_dict(self, state_dict, strict=True):
         """Load the state dicts of each of the models"""
-        if self.shared_query_context_model:
+        if self.biencoder_shared_query_context_model:
             print_rank_0("Loading shared query-context model")
             self.model.load_state_dict(state_dict[self._model_key], \
                 strict=strict)
@@ -188,14 +190,14 @@ class BiEncoderModel(MegatronModule):
         # load the LM state dict into each model
         model_dict = state_dict['model']['language_model']

-        if self.shared_query_context_model:
+        if self.biencoder_shared_query_context_model:
             self.model.language_model.load_state_dict(model_dict)
             fix_query_key_value_ordering(self.model, checkpoint_version)
         else:
             if self.use_query_model:
                 self.query_model.language_model.load_state_dict(model_dict)
                 # give each model the same ict_head to begin with as well
-                if self.projection_dim > 0:
+                if self.biencoder_projection_dim > 0:
                     query_proj_state_dict = \
                         self.state_dict_for_save_checkpoint()\
                         [self._query_key]['projection_enc']
@@ -203,7 +205,8 @@ class BiEncoderModel(MegatronModule):
             if self.use_context_model:
                 self.context_model.language_model.load_state_dict(model_dict)
-                if self.query_model is not None and self.projection_dim > 0:
+                if self.query_model is not None and \
+                    self.biencoder_projection_dim > 0:
                     self.context_model.projection_enc.load_state_dict\
                         (query_proj_state_dict)
                 fix_query_key_value_ordering(self.context_model, checkpoint_version)
@@ -220,8 +223,7 @@ class PretrainedBertModel(MegatronModule):
         args = get_args()
         tokenizer = get_tokenizer()
         self.pad_id = tokenizer.pad
-        self.pool_type = args.pool_type
-        self.projection_dim = args.projection_dim
+        self.biencoder_projection_dim = args.biencoder_projection_dim
         self.parallel_output = parallel_output
         init_method = init_method_normal(args.init_method_std)
         scaled_init_method = scaled_init_method_normal(
@@ -234,9 +236,9 @@ class PretrainedBertModel(MegatronModule):
             init_method=init_method,
             scaled_init_method=scaled_init_method)

-        if args.projection_dim > 0:
+        if args.biencoder_projection_dim > 0:
             self.projection_enc = get_linear_layer(args.hidden_size,
-                                                   args.projection_dim,
+                                                   args.biencoder_projection_dim,
                                                    init_method)
             self._projection_enc_key = 'projection_enc'
@@ -253,22 +255,14 @@ class PretrainedBertModel(MegatronModule):
         # This mask will be used in average-pooling and max-pooling
         pool_mask = (input_ids == self.pad_id).unsqueeze(2)

         # Taking the representation of the [CLS] token of BERT
-        if self.pool_type == "cls-token":
-            pooled_output = lm_output[:, 0, :]
-        elif self.pool_type == "avg": # Average Pooling
-            pooled_output = lm_output.masked_fill(pool_mask, 0)
-            pooled_output = pooled_output.sum(1) / (pool_mask.size(1) \
-                - pool_mask.float().sum(1))
-        elif self.pool_type == "max": # Max-Pooling
-            pooled_output = lm_output.masked_fill(pool_mask, -1000)
-            pooled_output = torch.max(pooled_output, 1)[0]
+        pooled_output = lm_output[:, 0, :]

         # Converting to float16 dtype
         pooled_output = pooled_output.to(lm_output.dtype)

         # Output.
-        if self.projection_dim:
+        if self.biencoder_projection_dim:
             pooled_output = self.projection_enc(pooled_output)

         return pooled_output
@@ -283,7 +277,7 @@ class PretrainedBertModel(MegatronModule):
             = self.language_model.state_dict_for_save_checkpoint(
                 destination, prefix, keep_vars)

-        if self.projection_dim > 0:
+        if self.biencoder_projection_dim > 0:
             state_dict_[self._projection_enc_key] = \
                 self.projection_enc.state_dict(destination, prefix, keep_vars)
@@ -295,7 +289,7 @@ class PretrainedBertModel(MegatronModule):
         self.language_model.load_state_dict(
             state_dict[self._language_model_key], strict=strict)

-        if self.projection_dim > 0:
+        if self.biencoder_projection_dim > 0:
             print_rank_0("loading projection head weights")
             self.projection_enc.load_state_dict(
                 state_dict[self._projection_enc_key], strict=strict)
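With --pool-type removed above, the bi-encoder always pools the [CLS] token and then applies the optional projection head. Below is a minimal PyTorch sketch of that retained path; the tensor names and sizes are illustrative, not the Megatron module:

# Sketch of CLS-token pooling followed by the optional projection head.
import torch
import torch.nn as nn

hidden_size, projection_dim = 768, 128          # assumed sizes
batch, seq_len = 4, 32

lm_output = torch.randn(batch, seq_len, hidden_size)      # stand-in for BERT output
projection_enc = nn.Linear(hidden_size, projection_dim)   # built when projection_dim > 0

# The representation is always the [CLS] (first) token ...
pooled_output = lm_output[:, 0, :]
# ... cast back to the LM output dtype (relevant under fp16 training) ...
pooled_output = pooled_output.to(lm_output.dtype)
# ... and projected only when --biencoder-projection-dim > 0.
if projection_dim > 0:
    pooled_output = projection_enc(pooled_output)

print(pooled_output.shape)  # torch.Size([4, 128])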
@@ -36,7 +36,8 @@ def pretrain_ict_model_provider():
     model = biencoder_model_provider(
         only_context_model=False,
         only_query_model=False,
-        shared_query_context_model=args.shared_query_context_model)
+        biencoder_shared_query_context_model=\
+            args.biencoder_shared_query_context_model)

     return model


 def get_group_world_size_rank():
@@ -120,7 +121,7 @@ def forward_step(data_iterator, model, input_tensor):
         return torch.cuda.FloatTensor([sum([int(i in sorted_indices[i, :k]) \
             for i in range(global_batch_size)]) / global_batch_size])

-    topk_accs = [topk_accuracy(int(k)) for k in args.report_topk_accuracies]
+    topk_accs = [topk_accuracy(int(k)) for k in args.retriever_report_topk_accuracies]

     labels = torch.arange(global_batch_size).long().cuda()
     loss = F.nll_loss(softmax_scores, labels, reduction='mean')
@@ -131,7 +132,7 @@ def forward_step(data_iterator, model, input_tensor):
     # create stats_dict with retrieval loss and all specified top-k accuracies
     topk_acc_dict = {'top{}_acc'.format(k): v * 100 for k, v in \
-        zip(args.report_topk_accuracies, reduced_losses[1:])}
+        zip(args.retriever_report_topk_accuracies, reduced_losses[1:])}
     stats_dict = dict(loss=reduced_losses[0], **topk_acc_dict)

     return loss, stats_dict
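The renamed --retriever-report-topk-accuracies feeds the in-batch top-k accuracy computed in forward_step above. The following CPU-only sketch reproduces that metric under assumed shapes; the real code works on CUDA tensors and averages the results across data-parallel ranks, and the names below are illustrative:

# Sketch of in-batch retrieval loss and top-k accuracy: each query's positive
# context is the one at the same batch index.
import torch
import torch.nn.functional as F

torch.manual_seed(0)
global_batch_size, dim = 8, 16
query_logits = torch.randn(global_batch_size, dim)     # stand-in query embeddings
context_logits = torch.randn(global_batch_size, dim)   # stand-in context embeddings

# Score every query against every in-batch context; row i's positive is column i.
retrieval_scores = torch.matmul(query_logits, context_logits.t())
softmax_scores = F.log_softmax(retrieval_scores, dim=1)
_, sorted_indices = torch.sort(softmax_scores, descending=True, dim=1)

def topk_accuracy(k):
    # Fraction of queries whose own context appears among their top-k results.
    return sum(int(i in sorted_indices[i, :k])
               for i in range(global_batch_size)) / global_batch_size

report_topk_accuracies = [1, 5]   # e.g. values passed to the renamed flag
topk_accs = [topk_accuracy(k) for k in report_topk_accuracies]

labels = torch.arange(global_batch_size)
loss = F.nll_loss(softmax_scores, labels, reduction='mean')
print(topk_accs, float(loss))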