Fixed based on review recoemmendation

43c9137b · Mostofa Patwary · 661553f6 · 43c9137b · 43c9137b · 43c9137b
Commit 43c9137b authored Mar 18, 2021 by Mostofa Patwary
5 changed files
--- a/megatron/arguments.py
+++ b/megatron/arguments.py
@@ -636,10 +636,6 @@ def _add_data_args(parser):
                       '1) a single data path, 2) multiple datasets in the'
                       'form: dataset1-weight dataset1-path dataset2-weight '
                       'dataset2-path ...')
-    group.add_argument('--qa-data-dev', type=str, default=None,
-                       help='Path to the QA dataset dev file.')
-    group.add_argument('--qa-data-test', type=str, default=None,
-                       help='Path to the QA dataset test file.')
    group.add_argument('--split', type=str, default='969, 30, 1',
                       help='Comma-separated list of proportions for training,'
                       ' validation, and test split. For example the split '
@@ -743,18 +739,11 @@ def _add_biencoder_args(parser):
                        'square root of hidden size')

    # faiss index
-    group.add_argument('--faiss-use-gpu', action='store_true',
-                       help='Whether create the FaissMIPSIndex on GPU')
    group.add_argument('--block-data-path', type=str, default=None,
                       help='Where to save/load BlockData to/from')
    group.add_argument('--embedding-path', type=str, default=None,
                       help='Where to save/load Open-Retrieval Embedding'
                        ' data to/from')
-    group.add_argument('--faiss-match', type=str, default='string', \
-                        choices=['regex', 'string'], help="Answer matching '\
-                        'logic type")
-    group.add_argument('--faiss-topk-retrievals', type=int, default=100,
-                       help='Number of blocks to use as top-k during retrieval')

    # indexer
    group.add_argument('--indexer-batch-size', type=int, default=128,

--- a/tasks/main.py
+++ b/tasks/main.py
@@ -47,6 +47,20 @@ def get_tasks_args(parser):
                       help='Sliding window for overlapping evaluation.')
    group.add_argument('--strict-lambada', action='store_true',
                       help='Use more difficult formulation of lambada.')
+    # Retriever args
+    group.add_argument('--qa-data-dev', type=str, default=None,
+                       help='Path to the QA dataset dev file.')
+    group.add_argument('--qa-data-test', type=str, default=None,
+                       help='Path to the QA dataset test file.')
+
+    # Faiss arguments for retriever
+    group.add_argument('--faiss-use-gpu', action='store_true',
+                       help='Whether create the FaissMIPSIndex on GPU')
+    group.add_argument('--faiss-match', type=str, default='string', \
+                        choices=['regex', 'string'], help="Answer matching '\
+                        'logic type")
+    group.add_argument('--faiss-topk-retrievals', type=int, default=100,
+                       help='Number of blocks to use as top-k during retrieval')

    return parser


--- a/tasks/orqa/evaluate_orqa.py
+++ b/tasks/orqa/evaluate_orqa.py
@@ -19,8 +19,6 @@ import os
 import sys

 from megatron import get_args
-from megatron.initialize import initialize_megatron
-
 from tasks.orqa.evaluate_utils import ORQAEvaluator

 def main():

--- a/tasks/orqa/natural_questions/qa_utils.py
+++ b/tasks/orqa/natural_questions/qa_utils.py
@@ -2,8 +2,11 @@
 # Copyright (c) Facebook, Inc. and its affiliates.
 # All rights reserved.
 #
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
+
+# The following code has been taken from
+# https://github.com/facebookresearch/DPR, which is CC-BY-NC 4.0
+# licensed as of now. More details on the license can be found
+# at https://github.com/facebookresearch/DPR/blob/master/LICENSE

 """
 Set of utilities for Q&A results validation tasks - Retriver passage

--- a/tasks/orqa/natural_questions/tokenizers.py
+++ b/tasks/orqa/natural_questions/tokenizers.py
@@ -2,9 +2,11 @@
 # Copyright (c) Facebook, Inc. and its affiliates.
 # All rights reserved.
 #
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.

+# The following code has been taken from
+# https://github.com/facebookresearch/DPR, which is CC-BY-NC 4.0
+# licensed as of now. More details on the license can be found
+# at https://github.com/facebookresearch/DPR/blob/master/LICENSE

 """
 Most of the tokenizers code here is copied from DrQA codebase to avoid adding extra dependency