Commit a702689d authored by Alexander

merge with upstream

parents 8d66cfef 008fc2a2
......@@ -3,3 +3,5 @@ env
data/
lm_cache
.idea
*.egg-info/
* @jon-tow @StellaAthena @haileyschoelkopf @lintangsutawika
* @haileyschoelkopf @lintangsutawika @StellaAthena
FROM nvidia/cuda:11.2.2-cudnn8-runtime-ubuntu20.04
### Install python 3.10 and set it as default python interpreter
RUN apt update && apt install software-properties-common -y && \
add-apt-repository ppa:deadsnakes/ppa -y && apt update && \
apt install curl -y && \
apt install python3.10 -y && \
update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 && \
update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1 && \
apt install python3.10-venv python3.10-dev -y && \
curl -Ss https://bootstrap.pypa.io/get-pip.py | python3.10 && \
apt-get clean && rm -rf /var/lib/apt/lists/
### Copy files
COPY . /lm-evaluation-harness/
### Set working directory
WORKDIR /lm-evaluation-harness
### Install requirements
RUN pip install --no-cache-dir -e .
### Run bash
CMD ["/bin/bash"]
# Language Model Evaluation Harness
## We're Refactoring LM-Eval!
(as of 6/15/23)
We have a revamp of the Evaluation Harness library internals staged on the [big-refactor](https://github.com/EleutherAI/lm-evaluation-harness/tree/big-refactor) branch! It is far along, but before we move the `master` branch of the repository over to this new design with a new version release, we'd like to ensure that it has been tested by outside users and that there are no glaring bugs.
We’d like your help to test it out! You can help by:
1. Trying out your current workloads on the big-refactor branch, and seeing if anything breaks or is counterintuitive,
2. Porting tasks supported in the previous version of the harness to the new YAML configuration format. Please check out our [task implementation guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md) for more information.
If you choose to port a task that has not yet been completed according to [our checklist](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/tasks/README.md), you can contribute it by opening a PR with [Refactor] in its title, containing:
- A shell command to run the task on the `master` branch, and the resulting score
- A shell command to run the task on your PR branch against `big-refactor`, and the resulting score, showing that the two implementations produce equal results (a sketch of the general shape of these commands follows below)
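For illustration only, a report might pair commands of the following shape; `sciq` and `gpt2` are arbitrary placeholders here, and the exact CLI on the `big-refactor` branch may differ:
```bash
# On the `master` branch (placeholder task `sciq`, placeholder model `gpt2`):
python main.py \
    --model hf-causal \
    --model_args pretrained=gpt2 \
    --tasks sciq
# Run the analogous command on your PR branch targeting `big-refactor`
# and report both scores in the PR description.
```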
Lastly, while we carry out this switch to the new version over the next week, we will not be accepting new feature requests for the `master` branch beyond those that are already open, though we will continue to accept bug fixes to the `master` branch and PRs to `big-refactor`. Feel free to reach out in the #lm-thunderdome channel of the EAI discord for more information.
## Overview
This project provides a unified framework to test generative language models on a large number of different evaluation tasks.
......@@ -131,6 +146,15 @@ python main.py \
--tasks hellaswag
```
GGUF or GGML quantized models can be loaded by using the `llama-cpp-python` server:
```bash
python main.py \
--model gguf \
--model_args base_url=http://localhost:8000 \
--tasks hellaswag
```
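The harness expects this server to already be running at the given `base_url`. As a rough pointer (not part of this repository), the `llama-cpp-python` server can typically be started along these lines; the model path below is a placeholder:
```bash
pip install "llama-cpp-python[server]"
# serves an OpenAI-compatible API on http://localhost:8000 by default
python -m llama_cpp.server --model ./path/to/model.Q4_K_M.gguf
```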
We support wildcards in task names; for example, you can run all of the machine-translated lambada tasks via `--tasks lambada_openai_mt_*`.
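For instance, a wildcard run might look like the following sketch; the model and `pretrained` value are placeholders, and the pattern is quoted so the shell does not expand it:
```bash
python main.py \
    --model hf-causal \
    --model_args pretrained=EleutherAI/gpt-j-6B \
    --tasks "lambada_openai_mt_*"
```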
We currently only support one prompt per task, which we strive to make the "standard" as defined by the benchmark's authors. If you would like to study how varying prompts causes changes in the evaluation score, check out the [BigScience fork](https://github.com/bigscience-workshop/lm-evaluation-harness) of this repo. We are currently working on upstreaming this capability to `main`.
......
......@@ -286,6 +286,13 @@
|reversed_words | |✓ | | 10000|acc |
|rte |✓ |✓ | | 277|acc |
|sciq |✓ |✓ |✓ | 1000|acc, acc_norm |
|scrolls_contractnli |✓ |✓ | | 1037|em, acc, acc_norm |
|scrolls_govreport |✓ |✓ | | 972|rouge1, rouge2, rougeL |
|scrolls_narrativeqa |✓ |✓ | | 3425|f1 |
|scrolls_qasper |✓ |✓ | | 984|f1 |
|scrolls_qmsum |✓ |✓ | | 272|rouge1, rouge2, rougeL |
|scrolls_quality |✓ |✓ | | 2086|em, acc, acc_norm |
|scrolls_summscreenfd |✓ |✓ | | 338|rouge1, rouge2, rougeL |
|squad2 |✓ |✓ | | 11873|exact, f1, HasAns_exact, HasAns_f1, NoAns_exact, NoAns_f1, best_exact, best_f1 |
|sst |✓ |✓ | | 872|acc |
|swag |✓ |✓ | | 20006|acc, acc_norm |
......@@ -371,3 +378,55 @@
|xwinograd_pt | | |✓ | 263|acc |
|xwinograd_ru | | |✓ | 315|acc |
|xwinograd_zh | | |✓ | 504|acc |
| Ceval-valid-computer_network | | ✓ | | 19 | acc |
| Ceval-valid-operating_system | | ✓ | | 19 | acc |
| Ceval-valid-computer_architecture | | ✓ | | 21 | acc |
| Ceval-valid-college_programming | | ✓ | | 37 | acc |
| Ceval-valid-college_physics | | ✓ | | 19 | acc |
| Ceval-valid-college_chemistry | | ✓ | | 24 | acc |
| Ceval-valid-advanced_mathematics | | ✓ | | 19 | acc |
| Ceval-valid-probability_and_statistics | | ✓ | | 18 | acc |
| Ceval-valid-discrete_mathematics | | ✓ | | 16 | acc |
| Ceval-valid-electrical_engineer | | ✓ | | 37 | acc |
| Ceval-valid-metrology_engineer | | ✓ | | 24 | acc |
| Ceval-valid-high_school_mathematics | | ✓ | | 18 | acc |
| Ceval-valid-high_school_physics | | ✓ | | 19 | acc |
| Ceval-valid-high_school_chemistry | | ✓ | | 19 | acc |
| Ceval-valid-high_school_biology | | ✓ | | 19 | acc |
| Ceval-valid-middle_school_mathematics | | ✓ | | 19 | acc |
| Ceval-valid-middle_school_biology | | ✓ | | 21 | acc |
| Ceval-valid-middle_school_physics | | ✓ | | 19 | acc |
| Ceval-valid-middle_school_chemistry | | ✓ | | 20 | acc |
| Ceval-valid-veterinary_medicine | | ✓ | | 23 | acc |
| Ceval-valid-college_economics | | ✓ | | 55 | acc |
| Ceval-valid-business_administration | | ✓ | | 33 | acc |
| Ceval-valid-marxism | | ✓ | | 19 | acc |
| Ceval-valid-mao_zedong_thought | | ✓ | | 24 | acc |
| Ceval-valid-education_science | | ✓ | | 29 | acc |
| Ceval-valid-teacher_qualification | | ✓ | | 44 | acc |
| Ceval-valid-high_school_politics | | ✓ | | 19 | acc |
| Ceval-valid-high_school_geography | | ✓ | | 19 | acc |
| Ceval-valid-middle_school_politics | | ✓ | | 21 | acc |
| Ceval-valid-middle_school_geography | | ✓ | | 12 | acc |
| Ceval-valid-modern_chinese_history | | ✓ | | 23 | acc |
| Ceval-valid-ideological_and_moral_cultivation | | ✓ | | 19 | acc |
| Ceval-valid-logic | | ✓ | | 22 | acc |
| Ceval-valid-law | | ✓ | | 24 | acc |
| Ceval-valid-chinese_language_and_literature | | ✓ | | 23 | acc |
| Ceval-valid-art_studies | | ✓ | | 33 | acc |
| Ceval-valid-professional_tour_guide | | ✓ | | 29 | acc |
| Ceval-valid-legal_professional | | ✓ | | 23 | acc |
| Ceval-valid-high_school_chinese | | ✓ | | 19 | acc |
| Ceval-valid-high_school_history | | ✓ | | 20 | acc |
| Ceval-valid-middle_school_history | | ✓ | | 22 | acc |
| Ceval-valid-civil_servant | | ✓ | | 47 | acc |
| Ceval-valid-sports_science | | ✓ | | 19 | acc |
| Ceval-valid-plant_protection | | ✓ | | 22 | acc |
| Ceval-valid-basic_medicine | | ✓ | | 19 | acc |
| Ceval-valid-clinical_medicine | | ✓ | | 22 | acc |
| Ceval-valid-urban_and_rural_planner | | ✓ | | 46 | acc |
| Ceval-valid-accountant | | ✓ | | 49 | acc |
| Ceval-valid-fire_engineer | | ✓ | | 31 | acc |
| Ceval-valid-environmental_impact_assessment_engineer | | ✓ | | 31 | acc |
| Ceval-valid-tax_accountant | | ✓ | | 49 | acc |
| Ceval-valid-physician | | ✓ | | 49 | acc |
......@@ -7,7 +7,6 @@ import os
import json
import hashlib
import datasets
from sqlitedict import SqliteDict
from tqdm import tqdm
import torch
import torch.nn.functional as F
......@@ -119,6 +118,12 @@ class LM(abc.ABC):
class BaseLM(LM):
def __init__(self):
super().__init__()
self.batch_schedule = 1
self.batch_sizes = {}
self.max_batch_size = 512
@property
@abstractmethod
def eot_token_id(self):
......@@ -167,6 +172,28 @@ class BaseLM(LM):
"""
pass
def _detect_batch_size(self, requests=None, pos=0):
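# Probe for the largest batch size that fits in memory, using the longest
# sequence at `pos` (or self.max_length) as the test shape.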
if requests:
_, context_enc, continuation_enc = requests[pos]
max_length = len(
(context_enc + continuation_enc)[-(self.max_length + 1) :][:-1]
)
else:
max_length = self.max_length
# if OOM, then halves batch_size and tries again
@find_executable_batch_size(starting_batch_size=self.max_batch_size)
def forward_batch(batch_size):
test_batch = torch.ones((batch_size, max_length), device=self.device).long()
for _ in range(5):
_ = F.log_softmax(self._model_call(test_batch), dim=-1).cpu()
return batch_size
batch_size = forward_batch()
utils.clear_torch_cache()
return batch_size
# subclass must implement properties vocab_size, eot_token_id, max_gen_toks, batch_size, device, max_length.
# TODO: enforce this somehow
......@@ -186,7 +213,9 @@ class BaseLM(LM):
for context, continuation in requests:
if context == "":
# end of text as context
context_enc, continuation_enc = [self.eot_token_id], self.tok_encode(continuation)
context_enc, continuation_enc = [self.eot_token_id], self.tok_encode(
continuation
)
else:
context_enc, continuation_enc = self._encode_pair(context, continuation)
......@@ -202,19 +231,7 @@ class BaseLM(LM):
if self.batch_size == "auto":
# using rolling window with maximum context
print("Passed argument batch_size = auto. Detecting largest batch size")
@find_executable_batch_size(
starting_batch_size=512
) # if OOM, then halves batch_size and tries again
def forward_batch(batch_size):
test_batch = torch.ones(
(batch_size, self.max_length), device=self.device
).long()
for _ in range(5):
_ = F.log_softmax(self._model_call(test_batch), dim=-1).cpu()
return batch_size
batch_size = forward_batch()
batch_size = self._detect_batch_size()
print(f"Determined Largest batch size: {batch_size}")
adaptive_batch_size = batch_size
......@@ -267,34 +284,34 @@ class BaseLM(LM):
re_ord = utils.Reorderer(requests, _collate)
reordered_requests = re_ord.get_reordered()
n_reordered_requests = len(reordered_requests)
# automatic (variable) batch size detection for vectorization
# pull longest context sample from request
if len(re_ord.get_reordered()) > 0:
_, context_enc, continuation_enc = re_ord.get_reordered()[0]
max_context = len((context_enc + continuation_enc)[-(self.max_length + 1) :][:-1])
if (self.batch_size == 'auto'):
if override_bs is None:
print('Passed argument batch_size = auto. Detecting largest batch size')
@find_executable_batch_size(starting_batch_size=512) # if OOM, then halves batch_size and tries again
def forward_batch(batch_size):
test_batch = torch.ones((batch_size, max_context), device=self.device).long()
for _ in range(5):
out = F.log_softmax(self._model_call(test_batch), dim = -1).cpu()
return batch_size
batch_size = forward_batch()
print(f"Determined largest batch size: {batch_size}")
adaptive_batch_size = batch_size
else:
adaptive_batch_size = override_bs
else:
adaptive_batch_size = 0 if override_bs is None else override_bs
def _batch_scheduler(pos):
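# Split the sorted requests into `batch_schedule` buckets and detect the
# largest workable batch size once per bucket, caching the result.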
sched = pos // int(n_reordered_requests / self.batch_schedule)
if sched in self.batch_sizes:
return self.batch_sizes[sched]
print(
f"Passed argument batch_size = auto:{self.batch_schedule}. Detecting largest batch size"
)
self.batch_sizes[sched] = self._detect_batch_size(reordered_requests, pos)
print(f"Determined largest batch size: {self.batch_sizes[sched]}")
return self.batch_sizes[sched]
for chunk in utils.chunks(
tqdm(re_ord.get_reordered(), disable=disable_tqdm),
self.batch_size if self.batch_size != "auto" else adaptive_batch_size,
tqdm(reordered_requests, disable=disable_tqdm),
n=self.batch_size
if self.batch_size != "auto"
else override_bs
if override_bs is not None
else 0,
fn=_batch_scheduler
if self.batch_size == "auto"
and n_reordered_requests > 0
and not override_bs
else None,
):
inps = []
cont_toks_list = []
......@@ -348,7 +365,7 @@ class BaseLM(LM):
cont_toks_list.append(cont)
inplens.append(inplen)
batched_inps = torch.cat(inps, dim=0) # [batch, padding_length
batched_inps = torch.cat(inps, dim=0) # [batch, padding_length]
multi_logits = F.log_softmax(
self._model_call(batched_inps), dim=-1
).cpu() # [batch, padding_length, vocab]
......@@ -359,6 +376,9 @@ class BaseLM(LM):
# Slice to original seq length
contlen = len(cont_toks)
inplen = inplen + (
logits.shape[0] - padding_length
) # if "virtual tokens" (from prompt tuning) are added, inplen is larger
logits = logits[inplen - contlen : inplen].unsqueeze(
0
) # [1, seq, vocab]
......@@ -395,18 +415,34 @@ class BaseLM(LM):
res = []
def _collate(x):
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch
# padded context length. this is useful to simplify the batching logic and more importantly to make
# automatic adaptive batches much much easier to implement
# - any OOMs will happen right away rather than near the end
toks = self.tok_encode(x[0])
return len(toks), x[0]
return -len(toks), x[0]
re_ord = utils.Reorderer(requests, _collate)
warn_stop_seq = False
for context, request_args in tqdm(re_ord.get_reordered()):
until = request_args["until"]
if isinstance(until, str):
until = [until]
if until:
(primary_until,) = self.tok_encode(until[0])
try:
(primary_until,) = self.tok_encode(until[0])
except ValueError:
if not warn_stop_seq:
print(
"Warning: a primary stop sequence is multi-token! Will default to EOS token for this tokenizer. Consider using `hf-causal-experimental` for multi-token stop sequence support for the time being."
)
warn_stop_seq = True
primary_until = self.eot_token_id
else:
primary_until = None
......@@ -854,6 +890,7 @@ class CachingLM:
:param cache_db: str
Path to cache db
"""
from sqlitedict import SqliteDict
self.lm = lm
self.cache_db = cache_db
if os.path.dirname(cache_db):
......@@ -864,6 +901,10 @@ class CachingLM:
lm.set_cache_hook(self.get_cache_hook())
def __getattr__(self, attr):
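# Pass non-callable attributes straight through to the wrapped LM;
# callable attributes are wrapped with the caching logic below.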
lm_attr = getattr(self.lm, attr)
if not callable(lm_attr):
return lm_attr
def fn(requests):
res = []
remaining_reqs = []
......
......@@ -43,8 +43,8 @@ level (for indicating the level of difficulty).
_HOMEPAGE = "https://github.com/chaochun/nlu-asdiv-dataset"
# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""
# License declared at https://github.com/chaochun/nlu-asdiv-dataset/blob/master/README.md
_LICENSE = "CC BY-NC 4.0"
_URLS = "https://github.com/chaochun/nlu-asdiv-dataset/archive/55790e5270bb91ccfa5053194b25732534696b50.zip"
......
......@@ -44,8 +44,7 @@ appear in a conversation.
_HOMEPAGE = "https://stanfordnlp.github.io/coqa/"
# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""
_LICENSE = "Different licenses depending on the content (see https://stanfordnlp.github.io/coqa/ for details)"
_URLS = {
"train": "https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json",
......
......@@ -43,8 +43,8 @@ and perform discrete operations over them (such as addition, counting, or sortin
_HOMEPAGE = "https://allenai.org/data/drop"
# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""
# License declared at https://allenai.org/data/drop
_LICENSE = "CC BY"
_URLS = {
"drop": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip",
......
......@@ -51,7 +51,10 @@ The dataset contains questions about the following topics: medicine, nursing, ps
_HOMEPAGE = "https://aghie.github.io/head-qa/"
_LICENSE = "MIT License"
# The Spanish data comes from the "Ministerio de Sanidad, Consumo y Bienestar Social", as indicated here: https://github.com/aghie/head-qa
# This Spanish data seems to follow the intellectual property rights stated here: https://www.sanidad.gob.es/avisoLegal/home.htm
# The English data was translated by the authors of head-qa (https://arxiv.org/pdf/1906.04701.pdf).
_LICENSE = "Custom license"
_URL = "https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t"
......
......@@ -41,8 +41,10 @@ learning agents.
_HOMEPAGE = "https://github.com/hendrycks/ethics"
# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""
# The authors declared that the dataset is not distributed under a copyright or intellectual property (https://arxiv.org/pdf/2008.02275.pdf)
# On Hugging Face, the dataset is distributed under the MIT license (https://huggingface.co/datasets/hendrycks/ethics)
# The common sense portion is from Reddit and might incur some licensing complications.
_LICENSE = "Ambiguous"
_URLS = "https://people.eecs.berkeley.edu/~hendrycks/ethics.tar"
......
......@@ -38,8 +38,8 @@ models to generate answer derivations and explanations.
_HOMEPAGE = "https://github.com/hendrycks/math"
# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""
# License declared at https://arxiv.org/pdf/2103.03874.pdf
_LICENSE = "MIT License"
_URLS = "https://people.eecs.berkeley.edu/~hendrycks/MATH.tar"
......
......@@ -38,8 +38,7 @@ NLP setting.
_HOMEPAGE = "https://github.com/lgw863/LogiQA-dataset"
# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""
_LICENSE = "No license found"
_URLS = {
"train": "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Train.txt",
......
......@@ -38,8 +38,7 @@ modified from Chinese high school English listening comprehension test data.
_HOMEPAGE = "https://github.com/Nealcly/MuTual"
# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""
_LICENSE = "No license found"
_URLS = "https://github.com/Nealcly/MuTual/archive/master.zip"
......
......@@ -38,8 +38,8 @@ math, computer science, and philosophy papers.
_HOMEPAGE = "https://pile.eleuther.ai/"
# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""
# More details at https://arxiv.org/pdf/2201.07311.pdf
_LICENSE = "Multiple licenses"
_URLS = {
"validation": "https://the-eye.eu/public/AI/pile/val.jsonl.zst",
......
......@@ -39,8 +39,8 @@ a teacher who answers the questions by providing short excerpts (spans) from the
_HOMEPAGE = "https://quac.ai/"
# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""
# License declared at https://quac.ai/
_LICENSE = "CC BY-SA 4.0"
_URLS = {
"train": "https://s3.amazonaws.com/my89public/quac/train_v0.2.json",
......
......@@ -39,8 +39,7 @@ multiple-choice analogy questions; 5 choices per question.
_HOMEPAGE = "https://aclweb.org/aclwiki/SAT_Analogy_Questions_(State_of_the_art)"
# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""
_LICENSE = "No license found"
class SatAnalogies(datasets.GeneratorBasedBuilder):
......
---
dataset_info:
features:
- name: question_id
dtype: string
- name: question_source
dtype: string
- name: question
dtype: string
- name: answer
struct:
- name: aliases
sequence: string
- name: value
dtype: string
- name: search_results
sequence:
- name: description
dtype: string
- name: filename
dtype: string
- name: rank
dtype: int32
- name: title
dtype: string
- name: url
dtype: string
- name: search_context
dtype: string
config_name: triviaqa
splits:
- name: train
num_bytes: 1270894387
num_examples: 87622
- name: validation
num_bytes: 163755044
num_examples: 11313
download_size: 632549060
dataset_size: 1434649431
---
{"triviaqa": {"description": "TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence\ntriples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts\nand independently gathered evidence documents, six per question on average, that provide\nhigh quality distant supervision for answering the questions.\n", "citation": "@InProceedings{JoshiTriviaQA2017,\n author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},\n title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},\n booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},\n month = {July},\n year = {2017},\n address = {Vancouver, Canada},\n publisher = {Association for Computational Linguistics},\n}\n", "homepage": "https://nlp.cs.washington.edu/triviaqa/", "license": "Apache License 2.0", "features": {"question_id": {"dtype": "string", "id": null, "_type": "Value"}, "question_source": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"aliases": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "value": {"dtype": "string", "id": null, "_type": "Value"}}, "search_results": {"feature": {"description": {"dtype": "string", "id": null, "_type": "Value"}, "filename": {"dtype": "string", "id": null, "_type": "Value"}, "rank": {"dtype": "int32", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "url": {"dtype": "string", "id": null, "_type": "Value"}, "search_context": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "triviaqa", "config_name": "triviaqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1271393601, "num_examples": 87622, "dataset_name": "triviaqa"}, "validation": {"name": "validation", "num_bytes": 163819509, "num_examples": 11313, "dataset_name": "triviaqa"}}, "download_checksums": {"http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz": {"num_bytes": 546481381, "checksum": "adc19b42769062d241a8fbe834c56e58598d9322eb6c614e9f33a68a2cf5523e"}}, "download_size": 546481381, "post_processing_size": null, "dataset_size": 1435213110, "size_in_bytes": 1981694491}}