# sat_analogies.py

# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""SAT Analogy Questions dataset."""


import os

import datasets


# BibTeX entry for the paper to cite; passed to `datasets.DatasetInfo` as
# `citation`.
_CITATION = """\
@article{article,
    author = {Turney, Peter},
    year = {2006},
    month = {09},
    pages = {379-416},
    title = {Similarity of Semantic Relations},
    volume = {32},
    journal = {Computational Linguistics},
    doi = {10.1162/coli.2006.32.3.379}
}
"""

# Human-readable summary of the dataset; passed to `datasets.DatasetInfo` as
# `description`.
_DESCRIPTION = """\
SAT (Scholastic Aptitude Test) Analogy Questions is a dataset comprising 374
multiple-choice analogy questions; 5 choices per question.
"""

# Reference web page for the dataset; passed to `datasets.DatasetInfo` as
# `homepage`.
_HOMEPAGE = "https://aclweb.org/aclwiki/SAT_Analogy_Questions_(State_of_the_art)"

# TODO: Add the license for the dataset here if you can find it.
# Left empty for now; passed to `datasets.DatasetInfo` as `license`.
_LICENSE = ""


class SatAnalogies(datasets.GeneratorBasedBuilder):
    """Dataset builder for the SAT (Scholastic Aptitude Test) Analogy Questions.

    The data is not downloaded automatically: it is read from a manually
    obtained ``SAT-package-V3.txt`` file located in the directory supplied as
    ``data_dir`` to ``datasets.load_dataset``. All questions are exposed as a
    single validation split.
    """

    VERSION = datasets.Version("0.0.1")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="sat_analogies",
            version=VERSION,
            description="The SAT Analogy Questions dataset",
        ),
    ]

    # Name of the raw data file expected inside the manual download directory.
    _DATA_FILENAME = "SAT-package-V3.txt"

    # Number of lines that make up one question:
    # source, stem, five choices, solution.
    _RECORD_LENGTH = 8

    @property
    def manual_download_instructions(self):
        """Return the instructions shown to users for obtaining the data manually."""
        return (
            "To use SAT Analogy Questions you have to download it manually. Please "
            "email Peter Turney to request the data (https://www.apperceptual.com). "
            "Once you receive a download link for the dataset, supply the local path "
            "as the `data_dir` arg: "
            "`datasets.load_dataset('sat_analogies', data_dir='path/to/folder/folder_name')`"
        )

    def _info(self):
        """Return the dataset metadata and the feature schema of one example."""
        features = datasets.Features(
            {
                "source": datasets.Value("string"),
                "stem": datasets.Value("string"),
                "choices": datasets.features.Sequence(datasets.Value("string")),
                "solution": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Locate the manually downloaded data file and declare the splits.

        Args:
            dl_manager: Download manager whose ``manual_dir`` points at the
                directory the user passed as ``data_dir``.

        Returns:
            A list with a single validation ``datasets.SplitGenerator`` whose
            ``gen_kwargs`` carry the path of the raw data file.

        Raises:
            FileNotFoundError: If the manual directory, or the data file
                inside it, does not exist.
        """
        data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))
        if not os.path.exists(data_dir):
            raise FileNotFoundError(
                f"{data_dir} does not exist. Make sure you insert a manual dir via "
                "`datasets.load_dataset('sat_analogies', data_dir=...)` that includes "
                f"{self._DATA_FILENAME}. Manual download instructions: "
                f"{self.manual_download_instructions}"
            )
        filepath = os.path.join(data_dir, self._DATA_FILENAME)
        # Fail early with actionable guidance instead of a bare error from
        # `open()` once example generation starts.
        if not os.path.isfile(filepath):
            raise FileNotFoundError(
                f"{filepath} does not exist. Make sure the manual dir passed via "
                "`datasets.load_dataset('sat_analogies', data_dir=...)` includes "
                f"{self._DATA_FILENAME}. Manual download instructions: "
                f"{self.manual_download_instructions}"
            )
        return [
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": filepath,
                },
            )
        ]

    @staticmethod
    def _iter_records(filepath):
        """Yield each record of the file as a list of stripped, non-blank lines.

        A record is a run of consecutive non-blank lines; blank lines separate
        records and lines starting with ``#`` are comments that are skipped.
        Leading, trailing and repeated blank lines never produce a record.
        """
        with open(filepath, "r", encoding="utf-8") as f:
            record = []
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    # A blank line closes the current record, if there is one.
                    if record:
                        yield record
                        record = []
                elif not line.startswith("#"):
                    record.append(line)
            # The last record need not be followed by a blank line.
            if record:
                yield record

    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
    def _generate_examples(self, filepath):
        """Yield ``(key, example)`` pairs parsed from the raw data file.

        The last eight lines of every record are interpreted, in order, as the
        source, the stem, five choices and the solution.

        Args:
            filepath: Path of the text file to parse.

        Yields:
            Tuples of a running integer key (starting at 0) and a dict with
            the keys ``source``, ``stem``, ``choices`` and ``solution``.

        Raises:
            ValueError: If a record has fewer than eight lines.
        """
        for key, record in enumerate(self._iter_records(filepath)):
            if len(record) < self._RECORD_LENGTH:
                raise ValueError(
                    f"Malformed record #{key} in {filepath}: expected at least "
                    f"{self._RECORD_LENGTH} lines, got {len(record)}: {record!r}"
                )
            # Take the fields from the end of the record so any extra leading
            # lines are ignored.
            source, stem, *choices, solution = record[-self._RECORD_LENGTH :]
            yield key, {
                "source": source,
                "stem": stem,
                "choices": choices,
                "solution": solution,
            }