Hosting wikitext on HuggingFace (#396)

* host wikitext on hf * transfer wikitext to Eleuther HF org

Hosting wikitext on HuggingFace (#396)
* host wikitext on hf * transfer wikitext to Eleuther HF org
83dbfbf6 · ben · GitHub · 16ac11af · 16ac11af · 16ac11af
Unverified Commit 83dbfbf6 authored Mar 10, 2023 by ben Committed by GitHub Mar 10, 2023
4 changed files
--- a/lm_eval/datasets/wikitext/__init__.py
+++ b/lm_eval/datasets/wikitext/__init__.py
--- a/lm_eval/datasets/wikitext/dataset_infos.json
+++ b/lm_eval/datasets/wikitext/dataset_infos.json
-{"wikitext-103-v1": {"description": " The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified\n Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike\n License.\n", "citation": "@misc{merity2016pointer,\n      title={Pointer Sentinel Mixture Models},\n      author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},\n      year={2016},\n      eprint={1609.07843},\n      archivePrefix={arXiv},\n      primaryClass={cs.CL}\n}\n", "homepage": "https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/", "license": "Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)", "features": {"page": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "wikitext", "config_name": "wikitext-103-v1", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 1281262, "num_examples": 62, "dataset_name": "wikitext"}, "train": {"name": "train", "num_bytes": 539297488, "num_examples": 29444, "dataset_name": "wikitext"}, "validation": {"name": "validation", "num_bytes": 1142488, "num_examples": 60, "dataset_name": "wikitext"}}, "download_checksums": {"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip": {"num_bytes": 190229076, "checksum": "242ba0f20b329cfdf1ccc61e9e9e5b59becf189db7f7a81cd2a0e2fc31539590"}}, "download_size": 190229076, "post_processing_size": null, "dataset_size": 541721238, "size_in_bytes": 731950314}, "wikitext-2-v1": {"description": " The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified\n Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike\n License.\n", "citation": "@misc{merity2016pointer,\n      title={Pointer Sentinel Mixture Models},\n      author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},\n      year={2016},\n      eprint={1609.07843},\n      archivePrefix={arXiv},\n      primaryClass={cs.CL}\n}\n", "homepage": "https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/", "license": "Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)", "features": {"page": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "wikitext", "config_name": "wikitext-2-v1", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 1256634, "num_examples": 62, "dataset_name": "wikitext"}, "train": {"name": "train", "num_bytes": 10799034, "num_examples": 629, "dataset_name": "wikitext"}, "validation": {"name": "validation", "num_bytes": 1121860, "num_examples": 60, "dataset_name": "wikitext"}}, "download_checksums": {"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip": {"num_bytes": 4475746, "checksum": "92675f1d63015c1c8b51f1656a52d5bdbc33aafa60cc47a218a66e7ee817488c"}}, "download_size": 4475746, "post_processing_size": null, "dataset_size": 13177528, "size_in_bytes": 17653274}, "wikitext-103-raw-v1": {"description": " The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified\n Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike\n License.\n", "citation": "@misc{merity2016pointer,\n      title={Pointer Sentinel Mixture Models},\n      author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},\n      year={2016},\n      eprint={1609.07843},\n      archivePrefix={arXiv},\n      primaryClass={cs.CL}\n}\n", "homepage": "https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/", "license": "Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)", "features": {"page": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "wikitext", "config_name": "wikitext-103-raw-v1", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 1290775, "num_examples": 62, "dataset_name": "wikitext"}, "train": {"name": "train", "num_bytes": 540656522, "num_examples": 29444, "dataset_name": "wikitext"}, "validation": {"name": "validation", "num_bytes": 1147025, "num_examples": 60, "dataset_name": "wikitext"}}, "download_checksums": {"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip": {"num_bytes": 191984949, "checksum": "91c00ae287f0d699e18605c84afc9e45c192bc6b7797ff8837e5474655a33794"}}, "download_size": 191984949, "post_processing_size": null, "dataset_size": 543094322, "size_in_bytes": 735079271}, "wikitext-2-raw-v1": {"description": " The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified\n Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike\n License.\n", "citation": "@misc{merity2016pointer,\n      title={Pointer Sentinel Mixture Models},\n      author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},\n      year={2016},\n      eprint={1609.07843},\n      archivePrefix={arXiv},\n      primaryClass={cs.CL}\n}\n", "homepage": "https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/", "license": "Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)", "features": {"page": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "wikitext", "config_name": "wikitext-2-raw-v1", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 1290775, "num_examples": 62, "dataset_name": "wikitext"}, "train": {"name": "train", "num_bytes": 10942633, "num_examples": 629, "dataset_name": "wikitext"}, "validation": {"name": "validation", "num_bytes": 1147025, "num_examples": 60, "dataset_name": "wikitext"}}, "download_checksums": {"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip": {"num_bytes": 4721645, "checksum": "ef7edb566e3e2b2d31b29c1fdb0c89a4cc683597484c3dc2517919c615435a11"}}, "download_size": 4721645, "post_processing_size": null, "dataset_size": 13380433, "size_in_bytes": 18102078}}
--- a/lm_eval/datasets/wikitext/wikitext.py
+++ b/lm_eval/datasets/wikitext/wikitext.py
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# NOTE: This is a modified version of https://github.com/huggingface/datasets/blob/master/datasets/wikitext/wikitext.py
-# that returns Wiki pages instead of Wiki text line-by-line.
-"""WikiText Dataset."""
-
-
-import os
-
-import datasets
-
-
-_CITATION = """\
-@misc{merity2016pointer,
-      title={Pointer Sentinel Mixture Models},
-      author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},
-      year={2016},
-      eprint={1609.07843},
-      archivePrefix={arXiv},
-      primaryClass={cs.CL}
-}
-"""
-
-_DESCRIPTION = """\
- The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified
- Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike
- License.
-"""
-_HOMEPAGE = "https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/"
-_LICENSE = "Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)"
-_DATA_URL = "https://s3.amazonaws.com/research.metamind.io/wikitext"
-
-
-class WikitextConfig(datasets.BuilderConfig):
-    """BuilderConfig for GLUE."""
-
-    def __init__(self, data_url, **kwargs):
-        """BuilderConfig for Wikitext
-        Args:
-          data_url: `string`, url to the dataset (word or raw level)
-          **kwargs: keyword arguments forwarded to super.
-        """
-        super(WikitextConfig, self).__init__(
-            version=datasets.Version(
-                "1.0.0",
-            ),
-            **kwargs,
-        )
-        self.data_url = data_url
-
-
-class Wikitext(datasets.GeneratorBasedBuilder):
-    """TODO(wikitext_103): Short description of my dataset."""
-
-    # TODO(wikitext_103): Set up version.
-    VERSION = datasets.Version("0.1.0")
-    BUILDER_CONFIGS = [
-        WikitextConfig(
-            name="wikitext-103-v1",
-            data_url=_DATA_URL + "/" + "wikitext-103-v1.zip",
-            description="Word level dataset. No processing is needed other than replacing newlines with <eos> tokens.",
-        ),
-        WikitextConfig(
-            name="wikitext-2-v1",
-            data_url=_DATA_URL + "/" + "wikitext-2-v1.zip",
-            description="Word level dataset. No processing is needed other than replacing newlines with <eos> tokens.",
-        ),
-        WikitextConfig(
-            name="wikitext-103-raw-v1",
-            data_url=_DATA_URL + "/" + "wikitext-103-raw-v1.zip",
-            description="Raw level dataset: the raw tokens before the addition of <unk> tokens. "
-            "They should only be used for character level work or for creating newly derived datasets.",
-        ),
-        WikitextConfig(
-            name="wikitext-2-raw-v1",
-            data_url=_DATA_URL + "/" + "wikitext-2-raw-v1.zip",
-            description="Raw level dataset: the raw tokens before the addition of <unk> tokens. "
-            "They should only be used for character level work or for creating newly derived datasets.",
-        ),
-    ]
-
-    def _info(self):
-        # TODO(wikitext): Specifies the datasets.DatasetInfo object
-        return datasets.DatasetInfo(
-            # This is the description that will appear on the datasets page.
-            description=_DESCRIPTION,
-            # datasets.features.FeatureConnectors
-            features=datasets.Features(
-                {
-                    "page": datasets.Value("string")
-                    # These are the features of your dataset like images, labels ...
-                }
-            ),
-            # If there's a common (input, target) tuple from the features,
-            # specify them here. They'll be used if as_supervised=True in
-            # builder.as_dataset.
-            supervised_keys=None,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION,
-        )
-
-    def _split_generators(self, dl_manager):
-        """Returns SplitGenerators."""
-        # TODO(wikitext): Downloads the data and defines the splits
-        # dl_manager is a datasets.download.DownloadManager that can be used to
-        # download and extract URLs
-        if self.config.name == "wikitext-103-v1":
-            data_file = dl_manager.download_and_extract(self.config.data_url)
-            data_dir = os.path.join(data_file, "wikitext-103")
-            return [
-                datasets.SplitGenerator(
-                    name=datasets.Split.TEST,
-                    gen_kwargs={
-                        "data_file": os.path.join(data_dir, "wiki.test.tokens"),
-                        "split": "test",
-                    },
-                ),
-                datasets.SplitGenerator(
-                    name=datasets.Split.TRAIN,
-                    gen_kwargs={
-                        "data_file": os.path.join(data_dir, "wiki.train.tokens"),
-                        "split": "train",
-                    },
-                ),
-                datasets.SplitGenerator(
-                    name=datasets.Split.VALIDATION,
-                    gen_kwargs={
-                        "data_file": os.path.join(data_dir, "wiki.valid.tokens"),
-                        "split": "valid",
-                    },
-                ),
-            ]
-        else:
-            if self.config.name == "wikitext-103-raw-v1":
-                data_file = dl_manager.download_and_extract(self.config.data_url)
-                data_dir = os.path.join(data_file, "wikitext-103-raw")
-                return [
-                    datasets.SplitGenerator(
-                        name=datasets.Split.TEST,
-                        gen_kwargs={
-                            "data_file": os.path.join(data_dir, "wiki.test.raw"),
-                            "split": "test",
-                        },
-                    ),
-                    datasets.SplitGenerator(
-                        name=datasets.Split.TRAIN,
-                        gen_kwargs={
-                            "data_file": os.path.join(data_dir, "wiki.train.raw"),
-                            "split": "train",
-                        },
-                    ),
-                    datasets.SplitGenerator(
-                        name=datasets.Split.VALIDATION,
-                        gen_kwargs={
-                            "data_file": os.path.join(data_dir, "wiki.valid.raw"),
-                            "split": "valid",
-                        },
-                    ),
-                ]
-            else:
-                if self.config.name == "wikitext-2-raw-v1":
-                    data_file = dl_manager.download_and_extract(self.config.data_url)
-                    data_dir = os.path.join(data_file, "wikitext-2-raw")
-                    return [
-                        datasets.SplitGenerator(
-                            name=datasets.Split.TEST,
-                            gen_kwargs={
-                                "data_file": os.path.join(data_dir, "wiki.test.raw"),
-                                "split": "test",
-                            },
-                        ),
-                        datasets.SplitGenerator(
-                            name=datasets.Split.TRAIN,
-                            gen_kwargs={
-                                "data_file": os.path.join(data_dir, "wiki.train.raw"),
-                                "split": "train",
-                            },
-                        ),
-                        datasets.SplitGenerator(
-                            name=datasets.Split.VALIDATION,
-                            gen_kwargs={
-                                "data_file": os.path.join(data_dir, "wiki.valid.raw"),
-                                "split": "valid",
-                            },
-                        ),
-                    ]
-                else:
-                    if self.config.name == "wikitext-2-v1":
-                        data_file = dl_manager.download_and_extract(
-                            self.config.data_url
-                        )
-                        data_dir = os.path.join(data_file, "wikitext-2")
-                        return [
-                            datasets.SplitGenerator(
-                                name=datasets.Split.TEST,
-                                gen_kwargs={
-                                    "data_file": os.path.join(
-                                        data_dir, "wiki.test.tokens"
-                                    ),
-                                    "split": "test",
-                                },
-                            ),
-                            datasets.SplitGenerator(
-                                name=datasets.Split.TRAIN,
-                                gen_kwargs={
-                                    "data_file": os.path.join(
-                                        data_dir, "wiki.train.tokens"
-                                    ),
-                                    "split": "train",
-                                },
-                            ),
-                            datasets.SplitGenerator(
-                                name=datasets.Split.VALIDATION,
-                                gen_kwargs={
-                                    "data_file": os.path.join(
-                                        data_dir, "wiki.valid.tokens"
-                                    ),
-                                    "split": "valid",
-                                },
-                            ),
-                        ]
-
-    def _generate_examples(self, data_file, split):
-        """Yields examples."""
-        with open(data_file, encoding="utf-8") as f:
-            key = 0
-            ret = []
-            data = f.read().split("\n")
-            for line in data:
-                rline = line.replace("= = =", "===").replace("= =", "==").strip()
-                if rline.startswith("= ") and rline.strip().endswith(" ="):
-                    page = "\n".join(ret)
-                    if page.strip():
-                        yield key, {"page": page}
-                        key += 1
-                    ret = []
-                ret.append(line)
-            page = "\n".join(ret)
-            yield key, {"page": page}
--- a/lm_eval/tasks/wikitext.py
+++ b/lm_eval/tasks/wikitext.py
@@ -10,8 +10,6 @@ NOTE: This `Task` is based on WikiText-2.
 Homepage: https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/
 """
 import re
-import inspect
-import lm_eval.datasets.wikitext.wikitext
 from lm_eval.base import PerplexityTask


@@ -63,7 +61,7 @@ def wikitext_detokenizer(string):

 class WikiText(PerplexityTask):
    VERSION = 1
-    DATASET_PATH = inspect.getfile(lm_eval.datasets.wikitext.wikitext)
+    DATASET_PATH = "EleutherAI/wikitext_document_level"
    DATASET_NAME = "wikitext-2-raw-v1"

    def has_training_docs(self):