Replace stale `triviaqa` dataset link

299e9505 · jon-tow · 62ca1840 · 299e9505 · 299e9505
Commit 299e9505 authored Dec 12, 2022 by jon-tow
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 46 additions and 6 deletions

lm_eval/datasets/triviaqa/README.md +40 -0

lm_eval/datasets/triviaqa/triviaqa.py +6 -6

No files found.
--- a/lm_eval/datasets/triviaqa/README.md
+++ b/lm_eval/datasets/triviaqa/README.md
+---
+dataset_info:
+  features:
+  - name: question_id
+    dtype: string
+  - name: question_source
+    dtype: string
+  - name: question
+    dtype: string
+  - name: answer
+    struct:
+    - name: aliases
+      sequence: string
+    - name: value
+      dtype: string
+  - name: search_results
+    sequence:
+    - name: description
+      dtype: string
+    - name: filename
+      dtype: string
+    - name: rank
+      dtype: int32
+    - name: title
+      dtype: string
+    - name: url
+      dtype: string
+    - name: search_context
+      dtype: string
+  config_name: triviaqa
+  splits:
+  - name: train
+    num_bytes: 1270894387
+    num_examples: 87622
+  - name: validation
+    num_bytes: 163755044
+    num_examples: 11313
+  download_size: 632549060
+  dataset_size: 1434649431
+---
--- a/lm_eval/datasets/triviaqa/triviaqa.py
+++ b/lm_eval/datasets/triviaqa/triviaqa.py
@@ -46,13 +46,13 @@ _HOMEPAGE = "https://nlp.cs.washington.edu/triviaqa/"

 _LICENSE = "Apache License 2.0"

-_URLS = "http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz"
+_URLS = "https://nlp.cs.washington.edu/triviaqa/data/triviaqa-unfiltered.tar.gz"


 class Triviaqa(datasets.GeneratorBasedBuilder):
    """TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence triples"""

-    VERSION = datasets.Version("0.0.1")
+    VERSION = datasets.Version("0.0.2")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
@@ -100,14 +100,14 @@ class Triviaqa(datasets.GeneratorBasedBuilder):
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
-                    "filepath": os.path.join(data_dir, "unfiltered-web-train.jsonl"),
+                    "filepath": os.path.join(data_dir, "triviaqa-unfiltered", "unfiltered-web-train.json"),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
-                    "filepath": os.path.join(data_dir, "unfiltered-web-dev.jsonl"),
+                    "filepath": os.path.join(data_dir, "triviaqa-unfiltered", "unfiltered-web-dev.json"),
                },
            ),
        ]
@@ -115,8 +115,8 @@ class Triviaqa(datasets.GeneratorBasedBuilder):
    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
    def _generate_examples(self, filepath):
        with open(filepath, encoding="utf-8") as f:
-            for key, row in enumerate(f):
-                data = json.loads(row)
+            json_data = json.load(f)['Data']
+            for key, data in enumerate(json_data):
                search_results = []
                for search_result in data["SearchResults"]:
                    search_results.append(