"vscode:/vscode.git/clone" did not exist on "e172f095ba4af2c98d7744ce4ffcf4cd3a8e123c"
Commit 299e9505 authored by jon-tow
Browse files

Replace stale `triviaqa` dataset link

parent 62ca1840
---
dataset_info:
  features:
  - name: question_id
    dtype: string
  - name: question_source
    dtype: string
  - name: question
    dtype: string
  - name: answer
    struct:
    - name: aliases
      sequence: string
    - name: value
      dtype: string
  - name: search_results
    sequence:
    - name: description
      dtype: string
    - name: filename
      dtype: string
    - name: rank
      dtype: int32
    - name: title
      dtype: string
    - name: url
      dtype: string
    - name: search_context
      dtype: string
  config_name: triviaqa
  splits:
  - name: train
    num_bytes: 1270894387
    num_examples: 87622
  - name: validation
    num_bytes: 163755044
    num_examples: 11313
  download_size: 632549060
  dataset_size: 1434649431
---
...@@ -46,13 +46,13 @@ _HOMEPAGE = "https://nlp.cs.washington.edu/triviaqa/" ...@@ -46,13 +46,13 @@ _HOMEPAGE = "https://nlp.cs.washington.edu/triviaqa/"
_LICENSE = "Apache License 2.0" _LICENSE = "Apache License 2.0"
_URLS = "http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz" _URLS = "https://nlp.cs.washington.edu/triviaqa/data/triviaqa-unfiltered.tar.gz"
class Triviaqa(datasets.GeneratorBasedBuilder): class Triviaqa(datasets.GeneratorBasedBuilder):
"""TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence triples""" """TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence triples"""
VERSION = datasets.Version("0.0.1") VERSION = datasets.Version("0.0.2")
BUILDER_CONFIGS = [ BUILDER_CONFIGS = [
datasets.BuilderConfig( datasets.BuilderConfig(
...@@ -100,14 +100,14 @@ class Triviaqa(datasets.GeneratorBasedBuilder): ...@@ -100,14 +100,14 @@ class Triviaqa(datasets.GeneratorBasedBuilder):
name=datasets.Split.TRAIN, name=datasets.Split.TRAIN,
# These kwargs will be passed to _generate_examples # These kwargs will be passed to _generate_examples
gen_kwargs={ gen_kwargs={
"filepath": os.path.join(data_dir, "unfiltered-web-train.jsonl"), "filepath": os.path.join(data_dir, "triviaqa-unfiltered", "unfiltered-web-train.json"),
}, },
), ),
datasets.SplitGenerator( datasets.SplitGenerator(
name=datasets.Split.VALIDATION, name=datasets.Split.VALIDATION,
# These kwargs will be passed to _generate_examples # These kwargs will be passed to _generate_examples
gen_kwargs={ gen_kwargs={
"filepath": os.path.join(data_dir, "unfiltered-web-dev.jsonl"), "filepath": os.path.join(data_dir, "triviaqa-unfiltered", "unfiltered-web-dev.json"),
}, },
), ),
] ]
...@@ -115,8 +115,8 @@ class Triviaqa(datasets.GeneratorBasedBuilder): ...@@ -115,8 +115,8 @@ class Triviaqa(datasets.GeneratorBasedBuilder):
# method parameters are unpacked from `gen_kwargs` as given in `_split_generators` # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
def _generate_examples(self, filepath): def _generate_examples(self, filepath):
with open(filepath, encoding="utf-8") as f: with open(filepath, encoding="utf-8") as f:
for key, row in enumerate(f): json_data = json.load(f)['Data']
data = json.loads(row) for key, data in enumerate(json_data):
search_results = [] search_results = []
for search_result in data["SearchResults"]: for search_result in data["SearchResults"]:
search_results.append( search_results.append(
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment