"git@developer.sourcefind.cn:gaoqiong/migraphx.git" did not exist on "e5dc96293dd6f0d651446059d2aacebd11772033"
Unverified commit d6ceced5 authored by Stella Biderman, committed by GitHub

Merge branch 'master' into auto-batching

parents 4d21ab6b fc4428dc
{"wikitext-103-v1": {"description": " The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified\n Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike\n License.\n", "citation": "@misc{merity2016pointer,\n title={Pointer Sentinel Mixture Models},\n author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},\n year={2016},\n eprint={1609.07843},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/", "license": "Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)", "features": {"page": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "wikitext", "config_name": "wikitext-103-v1", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 1281262, "num_examples": 62, "dataset_name": "wikitext"}, "train": {"name": "train", "num_bytes": 539297488, "num_examples": 29444, "dataset_name": "wikitext"}, "validation": {"name": "validation", "num_bytes": 1142488, "num_examples": 60, "dataset_name": "wikitext"}}, "download_checksums": {"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip": {"num_bytes": 190229076, "checksum": "242ba0f20b329cfdf1ccc61e9e9e5b59becf189db7f7a81cd2a0e2fc31539590"}}, "download_size": 190229076, "post_processing_size": null, "dataset_size": 541721238, "size_in_bytes": 731950314}, "wikitext-2-v1": {"description": " The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified\n Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike\n License.\n", "citation": "@misc{merity2016pointer,\n title={Pointer Sentinel Mixture Models},\n author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},\n year={2016},\n eprint={1609.07843},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/", "license": "Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)", "features": {"page": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "wikitext", "config_name": "wikitext-2-v1", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 1256634, "num_examples": 62, "dataset_name": "wikitext"}, "train": {"name": "train", "num_bytes": 10799034, "num_examples": 629, "dataset_name": "wikitext"}, "validation": {"name": "validation", "num_bytes": 1121860, "num_examples": 60, "dataset_name": "wikitext"}}, "download_checksums": {"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip": {"num_bytes": 4475746, "checksum": "92675f1d63015c1c8b51f1656a52d5bdbc33aafa60cc47a218a66e7ee817488c"}}, "download_size": 4475746, "post_processing_size": null, "dataset_size": 13177528, "size_in_bytes": 17653274}, "wikitext-103-raw-v1": {"description": " The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified\n Good and Featured articles on Wikipedia. 
The dataset is available under the Creative Commons Attribution-ShareAlike\n License.\n", "citation": "@misc{merity2016pointer,\n title={Pointer Sentinel Mixture Models},\n author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},\n year={2016},\n eprint={1609.07843},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/", "license": "Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)", "features": {"page": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "wikitext", "config_name": "wikitext-103-raw-v1", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 1290775, "num_examples": 62, "dataset_name": "wikitext"}, "train": {"name": "train", "num_bytes": 540656522, "num_examples": 29444, "dataset_name": "wikitext"}, "validation": {"name": "validation", "num_bytes": 1147025, "num_examples": 60, "dataset_name": "wikitext"}}, "download_checksums": {"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip": {"num_bytes": 191984949, "checksum": "91c00ae287f0d699e18605c84afc9e45c192bc6b7797ff8837e5474655a33794"}}, "download_size": 191984949, "post_processing_size": null, "dataset_size": 543094322, "size_in_bytes": 735079271}, "wikitext-2-raw-v1": {"description": " The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified\n Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike\n License.\n", "citation": "@misc{merity2016pointer,\n title={Pointer Sentinel Mixture Models},\n author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},\n year={2016},\n eprint={1609.07843},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/", "license": "Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)", "features": {"page": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "wikitext", "config_name": "wikitext-2-raw-v1", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 1290775, "num_examples": 62, "dataset_name": "wikitext"}, "train": {"name": "train", "num_bytes": 10942633, "num_examples": 629, "dataset_name": "wikitext"}, "validation": {"name": "validation", "num_bytes": 1147025, "num_examples": 60, "dataset_name": "wikitext"}}, "download_checksums": {"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip": {"num_bytes": 4721645, "checksum": "ef7edb566e3e2b2d31b29c1fdb0c89a4cc683597484c3dc2517919c615435a11"}}, "download_size": 4721645, "post_processing_size": null, "dataset_size": 13380433, "size_in_bytes": 18102078}}
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# NOTE: This is a modified version of https://github.com/huggingface/datasets/blob/master/datasets/wikitext/wikitext.py
# that returns Wiki pages instead of Wiki text line-by-line.
"""WikiText Dataset."""
import os
import datasets
_CITATION = """\
@misc{merity2016pointer,
title={Pointer Sentinel Mixture Models},
author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},
year={2016},
eprint={1609.07843},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""
_DESCRIPTION = """\
The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified
Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike
License.
"""
_HOMEPAGE = "https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/"
_LICENSE = "Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)"
_DATA_URL = "https://s3.amazonaws.com/research.metamind.io/wikitext"

class WikitextConfig(datasets.BuilderConfig):
    """BuilderConfig for Wikitext."""

    def __init__(self, data_url, **kwargs):
        """BuilderConfig for Wikitext.

        Args:
            data_url: `string`, url to the dataset (word or raw level)
            **kwargs: keyword arguments forwarded to super.
        """
        super(WikitextConfig, self).__init__(
            version=datasets.Version(
                "1.0.0",
            ),
            **kwargs,
        )
        self.data_url = data_url

class Wikitext(datasets.GeneratorBasedBuilder):
    """WikiText language modeling dataset, yielding one full Wikipedia page per example."""

    # TODO(wikitext_103): Set up version.
    VERSION = datasets.Version("0.1.0")
    BUILDER_CONFIGS = [
        WikitextConfig(
            name="wikitext-103-v1",
            data_url=_DATA_URL + "/" + "wikitext-103-v1.zip",
            description="Word level dataset. No processing is needed other than replacing newlines with <eos> tokens.",
        ),
        WikitextConfig(
            name="wikitext-2-v1",
            data_url=_DATA_URL + "/" + "wikitext-2-v1.zip",
            description="Word level dataset. No processing is needed other than replacing newlines with <eos> tokens.",
        ),
        WikitextConfig(
            name="wikitext-103-raw-v1",
            data_url=_DATA_URL + "/" + "wikitext-103-raw-v1.zip",
            description="Raw level dataset: the raw tokens before the addition of <unk> tokens. "
            "They should only be used for character level work or for creating newly derived datasets.",
        ),
        WikitextConfig(
            name="wikitext-2-raw-v1",
            data_url=_DATA_URL + "/" + "wikitext-2-raw-v1.zip",
            description="Raw level dataset: the raw tokens before the addition of <unk> tokens. "
            "They should only be used for character level work or for creating newly derived datasets.",
        ),
    ]

    def _info(self):
        # TODO(wikitext): Specifies the datasets.DatasetInfo object
        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # datasets.features.FeatureConnectors
            features=datasets.Features(
                {
                    "page": datasets.Value("string")
                    # These are the features of your dataset like images, labels ...
                }
            ),
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        # TODO(wikitext): Downloads the data and defines the splits
        # dl_manager is a datasets.download.DownloadManager that can be used to
        # download and extract URLs
        if self.config.name == "wikitext-103-v1":
            data_file = dl_manager.download_and_extract(self.config.data_url)
            data_dir = os.path.join(data_file, "wikitext-103")
            return [
                datasets.SplitGenerator(
                    name=datasets.Split.TEST,
                    gen_kwargs={
                        "data_file": os.path.join(data_dir, "wiki.test.tokens"),
                        "split": "test",
                    },
                ),
                datasets.SplitGenerator(
                    name=datasets.Split.TRAIN,
                    gen_kwargs={
                        "data_file": os.path.join(data_dir, "wiki.train.tokens"),
                        "split": "train",
                    },
                ),
                datasets.SplitGenerator(
                    name=datasets.Split.VALIDATION,
                    gen_kwargs={
                        "data_file": os.path.join(data_dir, "wiki.valid.tokens"),
                        "split": "valid",
                    },
                ),
            ]
        else:
            if self.config.name == "wikitext-103-raw-v1":
                data_file = dl_manager.download_and_extract(self.config.data_url)
                data_dir = os.path.join(data_file, "wikitext-103-raw")
                return [
                    datasets.SplitGenerator(
                        name=datasets.Split.TEST,
                        gen_kwargs={
                            "data_file": os.path.join(data_dir, "wiki.test.raw"),
                            "split": "test",
                        },
                    ),
                    datasets.SplitGenerator(
                        name=datasets.Split.TRAIN,
                        gen_kwargs={
                            "data_file": os.path.join(data_dir, "wiki.train.raw"),
                            "split": "train",
                        },
                    ),
                    datasets.SplitGenerator(
                        name=datasets.Split.VALIDATION,
                        gen_kwargs={
                            "data_file": os.path.join(data_dir, "wiki.valid.raw"),
                            "split": "valid",
                        },
                    ),
                ]
            else:
                if self.config.name == "wikitext-2-raw-v1":
                    data_file = dl_manager.download_and_extract(self.config.data_url)
                    data_dir = os.path.join(data_file, "wikitext-2-raw")
                    return [
                        datasets.SplitGenerator(
                            name=datasets.Split.TEST,
                            gen_kwargs={
                                "data_file": os.path.join(data_dir, "wiki.test.raw"),
                                "split": "test",
                            },
                        ),
                        datasets.SplitGenerator(
                            name=datasets.Split.TRAIN,
                            gen_kwargs={
                                "data_file": os.path.join(data_dir, "wiki.train.raw"),
                                "split": "train",
                            },
                        ),
                        datasets.SplitGenerator(
                            name=datasets.Split.VALIDATION,
                            gen_kwargs={
                                "data_file": os.path.join(data_dir, "wiki.valid.raw"),
                                "split": "valid",
                            },
                        ),
                    ]
                else:
                    if self.config.name == "wikitext-2-v1":
                        data_file = dl_manager.download_and_extract(self.config.data_url)
                        data_dir = os.path.join(data_file, "wikitext-2")
                        return [
                            datasets.SplitGenerator(
                                name=datasets.Split.TEST,
                                gen_kwargs={
                                    "data_file": os.path.join(data_dir, "wiki.test.tokens"),
                                    "split": "test",
                                },
                            ),
                            datasets.SplitGenerator(
                                name=datasets.Split.TRAIN,
                                gen_kwargs={
                                    "data_file": os.path.join(data_dir, "wiki.train.tokens"),
                                    "split": "train",
                                },
                            ),
                            datasets.SplitGenerator(
                                name=datasets.Split.VALIDATION,
                                gen_kwargs={
                                    "data_file": os.path.join(data_dir, "wiki.valid.tokens"),
                                    "split": "valid",
                                },
                            ),
                        ]

    def _generate_examples(self, data_file, split):
        """Yields examples, one per Wikipedia page."""
        with open(data_file, encoding="utf-8") as f:
            key = 0
            ret = []
            data = f.read().split("\n")
            for line in data:
                # Normalize sub-heading markers so that only top-level page headings
                # ("= Title =") match the check below, not subsections ("= = ... = =").
                rline = line.replace("= = =", "===").replace("= =", "==").strip()
                if rline.startswith("= ") and rline.strip().endswith(" ="):
                    # New page heading: flush the page accumulated so far.
                    page = "\n".join(ret)
                    if page.strip():
                        yield key, {"page": page}
                        key += 1
                    ret = []
                ret.append(line)
            # Flush the final page.
            page = "\n".join(ret)
            yield key, {"page": page}
@@ -6,7 +6,8 @@ from . import dummy
 MODEL_REGISTRY = {
     "hf": gpt2.HFLM,
-    "hf-causal": huggingface.AutoCausalLM,
+    "hf-causal": gpt2.HFLM,
+    "hf-causal-experimental": huggingface.AutoCausalLM,
     "hf-seq2seq": huggingface.AutoSeq2SeqLM,
     "gpt2": gpt2.GPT2LM,
     "gpt3": gpt3.GPT3LM,
...
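The registry above maps --model names to adapter classes. A self-contained sketch of the lookup pattern follows; the stand-in classes and the one-line get_model helper are assumptions for illustration, not code taken from this hunk.

# Illustrative registry lookup: keys are CLI model names, values are adapter classes.
# The classes here are stand-ins; in the harness they come from the gpt2/huggingface modules.
class HFLM: ...
class AutoCausalLM: ...

MODEL_REGISTRY = {
    "hf-causal": HFLM,                       # reverted to the stable GPT-2-style adapter
    "hf-causal-experimental": AutoCausalLM,  # opt-in path for the newer AutoModel adapter
}

def get_model(model_name):
    return MODEL_REGISTRY[model_name]

print(get_model("hf-causal-experimental"))   # -> AutoCausalLM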
 import torch
 import transformers
+from typing import Optional
 from lm_eval.base import BaseLM
 class HFLM(BaseLM):
@@ -12,6 +13,8 @@ class HFLM(BaseLM):
         subfolder=None,
         tokenizer=None,
         batch_size=1,
+        load_in_8bit: Optional[bool] = False,
+        trust_remote_code: Optional[bool] = False,
     ):
         super().__init__()
@@ -19,9 +22,8 @@ class HFLM(BaseLM):
         assert isinstance(pretrained, str)
         assert isinstance(batch_size, (int,str))
-        if device:
-            if device not in ["cuda", "cpu"]:
-                device = int(device)
+        device_list = set(["cuda", "cpu"] + [f'cuda:{i}' for i in range(torch.cuda.device_count())])
+        if device and device in device_list:
             self._device = torch.device(device)
             print(f"Using device '{device}'")
         else:
@@ -37,25 +39,20 @@ class HFLM(BaseLM):
         revision = revision + ("/" + subfolder if subfolder is not None else "")
         self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained(
-            pretrained, revision=revision, low_cpu_mem_usage=low_cpu_mem_usage
+            pretrained,
+            load_in_8bit=load_in_8bit,
+            low_cpu_mem_usage=low_cpu_mem_usage,
+            revision=revision,
+            trust_remote_code=trust_remote_code,
         ).to(self.device)
         self.gpt2.eval()
         self.tokenizer = transformers.AutoTokenizer.from_pretrained(
             pretrained if tokenizer is None else tokenizer,
             revision=revision,
+            trust_remote_code=trust_remote_code,
         )
-        assert isinstance(
-            self.tokenizer,
-            (
-                transformers.GPT2Tokenizer,
-                transformers.GPT2TokenizerFast,
-                transformers.T5Tokenizer,
-                transformers.T5TokenizerFast,
-            ),
-        ), "this tokenizer has not been checked for compatibility yet!"
         self.vocab_size = self.tokenizer.vocab_size
         if isinstance(
@@ -74,7 +71,6 @@ class HFLM(BaseLM):
         else:
             self.batch_size_per_gpu = int(batch_size)
     @property
     def eot_token_id(self):
         # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
@@ -120,9 +116,10 @@ class HFLM(BaseLM):
         return self.gpt2(inps)[0]
     def _model_generate(self, context, max_length, eos_token_id):
-        return self.gpt2.generate(
-            context, max_length=max_length, eos_token_id=eos_token_id, do_sample=False
-        )
+        generation_kwargs = {'do_sample': False, 'max_length': max_length}
+        if eos_token_id is not None:
+            generation_kwargs['eos_token_id'] = eos_token_id
+        return self.gpt2.generate(context, **generation_kwargs)
 # for backwards compatibility
...
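To make the new keyword arguments concrete, a hedged construction example follows. The parameter names come from the hunk above; the checkpoint name is illustrative, the constructor has additional parameters not shown here, and load_in_8bit further requires bitsandbytes and a CUDA device at runtime.

# Hedged sketch: constructing HFLM with the newly added flags.
from lm_eval.models.gpt2 import HFLM

lm = HFLM(
    pretrained="EleutherAI/pythia-70m",  # illustrative checkpoint
    device="cpu",
    batch_size=1,
    load_in_8bit=False,        # True needs bitsandbytes and a CUDA device
    trust_remote_code=False,   # opt in only for models that ship custom modeling code
)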
@@ -123,7 +123,8 @@ class TextSynthLM(BaseLM):
         res = []
         for request in tqdm(requests):
             inp = request[0]
-            until = request[1]
+            request_args = request[1]
+            until = request_args['until']
             response = textsynth_completion(
                 url=self.api_url + "/v1/engines/" + self.engine + "/completions",
                 headers={"Authorization": "Bearer " + self.api_key},
...
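This change reflects that greedy_until requests now carry a dict of generation arguments rather than a bare stop-sequence list. A small sketch of the request shape this code unpacks (inferred from the hunk above):

# Sketch of the new request tuple: (prompt, args dict) instead of (prompt, stop list).
request = ("Question: 2 + 2 = ?\nAnswer:", {"until": ["\n"]})
inp = request[0]               # the prompt
request_args = request[1]      # per-request generation arguments
until = request_args["until"]  # stop sequences, here ["\n"]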
@@ -52,6 +52,7 @@ from . import gsm8k
 from . import storycloze
 from . import toxigen
 from . import crowspairs
+from . import bigbench
 ########################################
 # Translation tasks
@@ -310,6 +311,7 @@ TASK_REGISTRY = {
     # "storycloze_2016": storycloze.StoryCloze2016,
     # "storycloze_2018": storycloze.StoryCloze2018,
     # "sat": sat.SATAnalogies,
+    **bigbench.create_all_tasks(),
 }
...
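The **bigbench.create_all_tasks() entry splices a generated mapping of task names to task classes into TASK_REGISTRY alongside the hand-written entries. A self-contained sketch of the pattern; the subtask names and the stand-in class are illustrative, not the real bigbench module.

# Illustrative factory pattern: create_all_tasks() returns {task_name: task_class},
# which ** merges into the registry next to hand-written entries.
class DummyTask:
    """Stand-in for a real Task subclass."""

def create_all_tasks():
    subtasks = ["anachronisms", "arithmetic"]  # illustrative subtask names
    return {f"bigbench_{name}": DummyTask for name in subtasks}

TASK_REGISTRY = {
    "existing_task": DummyTask,  # stands in for the hand-written entries
    **create_all_tasks(),
}
print(sorted(TASK_REGISTRY))  # ['bigbench_anachronisms', 'bigbench_arithmetic', 'existing_task']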
@@ -7,8 +7,6 @@ problem in natural language.
 Homepage: https://github.com/openai/gpt-3/tree/master/data
 """
-import inspect
-import lm_eval.datasets.arithmetic.arithmetic
 from lm_eval.base import Task, rf
 from lm_eval.metrics import mean
@@ -30,7 +28,7 @@ _CITATION = """
 class Arithmetic(Task):
     VERSION = 0
-    DATASET_PATH = inspect.getfile(lm_eval.datasets.arithmetic.arithmetic)
+    DATASET_PATH = "EleutherAI/arithmetic"
     def has_training_docs(self):
         return False
...
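With DATASET_PATH now pointing at a Hub dataset ID instead of a bundled script, the task data can be pulled directly from the Hugging Face Hub. A hedged sketch follows; the configuration name is an assumption (each arithmetic task selects one of the dataset's configurations).

# Hedged sketch: fetching the arithmetic data from the Hub. The config name is assumed.
import datasets

arith = datasets.load_dataset("EleutherAI/arithmetic", "arithmetic_2da")
split = next(iter(arith.values()))  # whichever split(s) the Hub dataset provides
print(split[0])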