"git@developer.sourcefind.cn:gaoqiong/migraphx.git" did not exist on "e5dc96293dd6f0d651446059d2aacebd11772033"
Unverified commit d6ceced5 authored by Stella Biderman, committed by GitHub

Merge branch 'master' into auto-batching

parents 4d21ab6b fc4428dc
{"wikitext-103-v1": {"description": " The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified\n Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike\n License.\n", "citation": "@misc{merity2016pointer,\n title={Pointer Sentinel Mixture Models},\n author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},\n year={2016},\n eprint={1609.07843},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/", "license": "Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)", "features": {"page": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "wikitext", "config_name": "wikitext-103-v1", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 1281262, "num_examples": 62, "dataset_name": "wikitext"}, "train": {"name": "train", "num_bytes": 539297488, "num_examples": 29444, "dataset_name": "wikitext"}, "validation": {"name": "validation", "num_bytes": 1142488, "num_examples": 60, "dataset_name": "wikitext"}}, "download_checksums": {"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip": {"num_bytes": 190229076, "checksum": "242ba0f20b329cfdf1ccc61e9e9e5b59becf189db7f7a81cd2a0e2fc31539590"}}, "download_size": 190229076, "post_processing_size": null, "dataset_size": 541721238, "size_in_bytes": 731950314}, "wikitext-2-v1": {"description": " The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified\n Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike\n License.\n", "citation": "@misc{merity2016pointer,\n title={Pointer Sentinel Mixture Models},\n author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},\n year={2016},\n eprint={1609.07843},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/", "license": "Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)", "features": {"page": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "wikitext", "config_name": "wikitext-2-v1", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 1256634, "num_examples": 62, "dataset_name": "wikitext"}, "train": {"name": "train", "num_bytes": 10799034, "num_examples": 629, "dataset_name": "wikitext"}, "validation": {"name": "validation", "num_bytes": 1121860, "num_examples": 60, "dataset_name": "wikitext"}}, "download_checksums": {"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip": {"num_bytes": 4475746, "checksum": "92675f1d63015c1c8b51f1656a52d5bdbc33aafa60cc47a218a66e7ee817488c"}}, "download_size": 4475746, "post_processing_size": null, "dataset_size": 13177528, "size_in_bytes": 17653274}, "wikitext-103-raw-v1": {"description": " The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified\n Good and Featured articles on Wikipedia. 
The dataset is available under the Creative Commons Attribution-ShareAlike\n License.\n", "citation": "@misc{merity2016pointer,\n title={Pointer Sentinel Mixture Models},\n author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},\n year={2016},\n eprint={1609.07843},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/", "license": "Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)", "features": {"page": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "wikitext", "config_name": "wikitext-103-raw-v1", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 1290775, "num_examples": 62, "dataset_name": "wikitext"}, "train": {"name": "train", "num_bytes": 540656522, "num_examples": 29444, "dataset_name": "wikitext"}, "validation": {"name": "validation", "num_bytes": 1147025, "num_examples": 60, "dataset_name": "wikitext"}}, "download_checksums": {"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip": {"num_bytes": 191984949, "checksum": "91c00ae287f0d699e18605c84afc9e45c192bc6b7797ff8837e5474655a33794"}}, "download_size": 191984949, "post_processing_size": null, "dataset_size": 543094322, "size_in_bytes": 735079271}, "wikitext-2-raw-v1": {"description": " The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified\n Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike\n License.\n", "citation": "@misc{merity2016pointer,\n title={Pointer Sentinel Mixture Models},\n author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},\n year={2016},\n eprint={1609.07843},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/", "license": "Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)", "features": {"page": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "wikitext", "config_name": "wikitext-2-raw-v1", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 1290775, "num_examples": 62, "dataset_name": "wikitext"}, "train": {"name": "train", "num_bytes": 10942633, "num_examples": 629, "dataset_name": "wikitext"}, "validation": {"name": "validation", "num_bytes": 1147025, "num_examples": 60, "dataset_name": "wikitext"}}, "download_checksums": {"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip": {"num_bytes": 4721645, "checksum": "ef7edb566e3e2b2d31b29c1fdb0c89a4cc683597484c3dc2517919c615435a11"}}, "download_size": 4721645, "post_processing_size": null, "dataset_size": 13380433, "size_in_bytes": 18102078}}
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# NOTE: This is a modified version of https://github.com/huggingface/datasets/blob/master/datasets/wikitext/wikitext.py
# that returns Wiki pages instead of Wiki text line-by-line.
"""WikiText Dataset."""
import os
import datasets
_CITATION = """\
@misc{merity2016pointer,
title={Pointer Sentinel Mixture Models},
author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},
year={2016},
eprint={1609.07843},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""
_DESCRIPTION = """\
The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified
Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike
License.
"""
_HOMEPAGE = "https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/"
_LICENSE = "Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)"
_DATA_URL = "https://s3.amazonaws.com/research.metamind.io/wikitext"

class WikitextConfig(datasets.BuilderConfig):
    """BuilderConfig for Wikitext."""

    def __init__(self, data_url, **kwargs):
        """BuilderConfig for Wikitext.

        Args:
            data_url: `string`, url to the dataset (word or raw level)
            **kwargs: keyword arguments forwarded to super.
        """
        super(WikitextConfig, self).__init__(
            version=datasets.Version(
                "1.0.0",
            ),
            **kwargs,
        )
        self.data_url = data_url

class Wikitext(datasets.GeneratorBasedBuilder):
    """WikiText language modeling dataset, yielding one full Wikipedia page per example."""

    # TODO(wikitext_103): Set up version.
    VERSION = datasets.Version("0.1.0")
    BUILDER_CONFIGS = [
        WikitextConfig(
            name="wikitext-103-v1",
            data_url=_DATA_URL + "/" + "wikitext-103-v1.zip",
            description="Word level dataset. No processing is needed other than replacing newlines with <eos> tokens.",
        ),
        WikitextConfig(
            name="wikitext-2-v1",
            data_url=_DATA_URL + "/" + "wikitext-2-v1.zip",
            description="Word level dataset. No processing is needed other than replacing newlines with <eos> tokens.",
        ),
        WikitextConfig(
            name="wikitext-103-raw-v1",
            data_url=_DATA_URL + "/" + "wikitext-103-raw-v1.zip",
            description="Raw level dataset: the raw tokens before the addition of <unk> tokens. "
            "They should only be used for character level work or for creating newly derived datasets.",
        ),
        WikitextConfig(
            name="wikitext-2-raw-v1",
            data_url=_DATA_URL + "/" + "wikitext-2-raw-v1.zip",
            description="Raw level dataset: the raw tokens before the addition of <unk> tokens. "
            "They should only be used for character level work or for creating newly derived datasets.",
        ),
    ]

    def _info(self):
        # TODO(wikitext): Specifies the datasets.DatasetInfo object
        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # datasets.features.FeatureConnectors
            features=datasets.Features(
                {
                    "page": datasets.Value("string")
                    # These are the features of your dataset like images, labels ...
                }
            ),
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        # TODO(wikitext): Downloads the data and defines the splits
        # dl_manager is a datasets.download.DownloadManager that can be used to
        # download and extract URLs
        if self.config.name == "wikitext-103-v1":
            data_file = dl_manager.download_and_extract(self.config.data_url)
            data_dir = os.path.join(data_file, "wikitext-103")
            return [
                datasets.SplitGenerator(
                    name=datasets.Split.TEST,
                    gen_kwargs={
                        "data_file": os.path.join(data_dir, "wiki.test.tokens"),
                        "split": "test",
                    },
                ),
                datasets.SplitGenerator(
                    name=datasets.Split.TRAIN,
                    gen_kwargs={
                        "data_file": os.path.join(data_dir, "wiki.train.tokens"),
                        "split": "train",
                    },
                ),
                datasets.SplitGenerator(
                    name=datasets.Split.VALIDATION,
                    gen_kwargs={
                        "data_file": os.path.join(data_dir, "wiki.valid.tokens"),
                        "split": "valid",
                    },
                ),
            ]
        else:
            if self.config.name == "wikitext-103-raw-v1":
                data_file = dl_manager.download_and_extract(self.config.data_url)
                data_dir = os.path.join(data_file, "wikitext-103-raw")
                return [
                    datasets.SplitGenerator(
                        name=datasets.Split.TEST,
                        gen_kwargs={
                            "data_file": os.path.join(data_dir, "wiki.test.raw"),
                            "split": "test",
                        },
                    ),
                    datasets.SplitGenerator(
                        name=datasets.Split.TRAIN,
                        gen_kwargs={
                            "data_file": os.path.join(data_dir, "wiki.train.raw"),
                            "split": "train",
                        },
                    ),
                    datasets.SplitGenerator(
                        name=datasets.Split.VALIDATION,
                        gen_kwargs={
                            "data_file": os.path.join(data_dir, "wiki.valid.raw"),
                            "split": "valid",
                        },
                    ),
                ]
            else:
                if self.config.name == "wikitext-2-raw-v1":
                    data_file = dl_manager.download_and_extract(self.config.data_url)
                    data_dir = os.path.join(data_file, "wikitext-2-raw")
                    return [
                        datasets.SplitGenerator(
                            name=datasets.Split.TEST,
                            gen_kwargs={
                                "data_file": os.path.join(data_dir, "wiki.test.raw"),
                                "split": "test",
                            },
                        ),
                        datasets.SplitGenerator(
                            name=datasets.Split.TRAIN,
                            gen_kwargs={
                                "data_file": os.path.join(data_dir, "wiki.train.raw"),
                                "split": "train",
                            },
                        ),
                        datasets.SplitGenerator(
                            name=datasets.Split.VALIDATION,
                            gen_kwargs={
                                "data_file": os.path.join(data_dir, "wiki.valid.raw"),
                                "split": "valid",
                            },
                        ),
                    ]
                else:
                    if self.config.name == "wikitext-2-v1":
                        data_file = dl_manager.download_and_extract(self.config.data_url)
                        data_dir = os.path.join(data_file, "wikitext-2")
                        return [
                            datasets.SplitGenerator(
                                name=datasets.Split.TEST,
                                gen_kwargs={
                                    "data_file": os.path.join(data_dir, "wiki.test.tokens"),
                                    "split": "test",
                                },
                            ),
                            datasets.SplitGenerator(
                                name=datasets.Split.TRAIN,
                                gen_kwargs={
                                    "data_file": os.path.join(data_dir, "wiki.train.tokens"),
                                    "split": "train",
                                },
                            ),
                            datasets.SplitGenerator(
                                name=datasets.Split.VALIDATION,
                                gen_kwargs={
                                    "data_file": os.path.join(data_dir, "wiki.valid.tokens"),
                                    "split": "valid",
                                },
                            ),
                        ]

    def _generate_examples(self, data_file, split):
        """Yields examples, one per Wikipedia page."""
        with open(data_file, encoding="utf-8") as f:
            key = 0
            ret = []
            data = f.read().split("\n")
            for line in data:
                # Normalize sub-heading markers so that only top-level page headings
                # ("= Title =") match the check below, not subsections ("= = ... = =").
                rline = line.replace("= = =", "===").replace("= =", "==").strip()
                if rline.startswith("= ") and rline.strip().endswith(" ="):
                    # New page heading: flush the page accumulated so far.
                    page = "\n".join(ret)
                    if page.strip():
                        yield key, {"page": page}
                        key += 1
                    ret = []
                ret.append(line)
            # Flush the final page.
            page = "\n".join(ret)
            yield key, {"page": page}
@@ -6,7 +6,8 @@ from . import dummy
 MODEL_REGISTRY = {
     "hf": gpt2.HFLM,
-    "hf-causal": huggingface.AutoCausalLM,
+    "hf-causal": gpt2.HFLM,
+    "hf-causal-experimental": huggingface.AutoCausalLM,
     "hf-seq2seq": huggingface.AutoSeq2SeqLM,
     "gpt2": gpt2.GPT2LM,
     "gpt3": gpt3.GPT3LM,
...
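The registry above maps --model names to adapter classes. A self-contained sketch of the lookup pattern follows; the stand-in classes and the one-line get_model helper are assumptions for illustration, not code taken from this hunk.

# Illustrative registry lookup: keys are CLI model names, values are adapter classes.
# The classes here are stand-ins; in the harness they come from the gpt2/huggingface modules.
class HFLM: ...
class AutoCausalLM: ...

MODEL_REGISTRY = {
    "hf-causal": HFLM,                       # reverted to the stable GPT-2-style adapter
    "hf-causal-experimental": AutoCausalLM,  # opt-in path for the newer AutoModel adapter
}

def get_model(model_name):
    return MODEL_REGISTRY[model_name]

print(get_model("hf-causal-experimental"))   # -> AutoCausalLM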
 import torch
 import transformers
+from typing import Optional
 from lm_eval.base import BaseLM
 class HFLM(BaseLM):
@@ -12,6 +13,8 @@ class HFLM(BaseLM):
         subfolder=None,
         tokenizer=None,
         batch_size=1,
+        load_in_8bit: Optional[bool] = False,
+        trust_remote_code: Optional[bool] = False,
     ):
         super().__init__()
@@ -19,9 +22,8 @@ class HFLM(BaseLM):
         assert isinstance(pretrained, str)
         assert isinstance(batch_size, (int,str))
-        if device:
-            if device not in ["cuda", "cpu"]:
-                device = int(device)
+        device_list = set(["cuda", "cpu"] + [f'cuda:{i}' for i in range(torch.cuda.device_count())])
+        if device and device in device_list:
             self._device = torch.device(device)
             print(f"Using device '{device}'")
         else:
@@ -37,25 +39,20 @@ class HFLM(BaseLM):
         revision = revision + ("/" + subfolder if subfolder is not None else "")
         self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained(
-            pretrained, revision=revision, low_cpu_mem_usage=low_cpu_mem_usage
+            pretrained,
+            load_in_8bit=load_in_8bit,
+            low_cpu_mem_usage=low_cpu_mem_usage,
+            revision=revision,
+            trust_remote_code=trust_remote_code,
         ).to(self.device)
         self.gpt2.eval()
         self.tokenizer = transformers.AutoTokenizer.from_pretrained(
             pretrained if tokenizer is None else tokenizer,
             revision=revision,
+            trust_remote_code=trust_remote_code,
         )
-        assert isinstance(
-            self.tokenizer,
-            (
-                transformers.GPT2Tokenizer,
-                transformers.GPT2TokenizerFast,
-                transformers.T5Tokenizer,
-                transformers.T5TokenizerFast,
-            ),
-        ), "this tokenizer has not been checked for compatibility yet!"
         self.vocab_size = self.tokenizer.vocab_size
         if isinstance(
@@ -74,7 +71,6 @@ class HFLM(BaseLM):
         else:
             self.batch_size_per_gpu = int(batch_size)
     @property
     def eot_token_id(self):
         # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
@@ -120,9 +116,10 @@ class HFLM(BaseLM):
         return self.gpt2(inps)[0]
     def _model_generate(self, context, max_length, eos_token_id):
-        return self.gpt2.generate(
-            context, max_length=max_length, eos_token_id=eos_token_id, do_sample=False
-        )
+        generation_kwargs = {'do_sample': False, 'max_length': max_length}
+        if eos_token_id is not None:
+            generation_kwargs['eos_token_id'] = eos_token_id
+        return self.gpt2.generate(context, **generation_kwargs)
 # for backwards compatibility
...
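To make the new keyword arguments concrete, a hedged construction example follows. The parameter names come from the hunk above; the checkpoint name is illustrative, the constructor has additional parameters not shown here, and load_in_8bit further requires bitsandbytes and a CUDA device at runtime.

# Hedged sketch: constructing HFLM with the newly added flags.
from lm_eval.models.gpt2 import HFLM

lm = HFLM(
    pretrained="EleutherAI/pythia-70m",  # illustrative checkpoint
    device="cpu",
    batch_size=1,
    load_in_8bit=False,        # True needs bitsandbytes and a CUDA device
    trust_remote_code=False,   # opt in only for models that ship custom modeling code
)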
@@ -123,7 +123,8 @@ class TextSynthLM(BaseLM):
         res = []
         for request in tqdm(requests):
             inp = request[0]
-            until = request[1]
+            request_args = request[1]
+            until = request_args['until']
             response = textsynth_completion(
                 url=self.api_url + "/v1/engines/" + self.engine + "/completions",
                 headers={"Authorization": "Bearer " + self.api_key},
...
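This change reflects that greedy_until requests now carry a dict of generation arguments rather than a bare stop-sequence list. A small sketch of the request shape this code unpacks (inferred from the hunk above):

# Sketch of the new request tuple: (prompt, args dict) instead of (prompt, stop list).
request = ("Question: 2 + 2 = ?\nAnswer:", {"until": ["\n"]})
inp = request[0]               # the prompt
request_args = request[1]      # per-request generation arguments
until = request_args["until"]  # stop sequences, here ["\n"]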
@@ -52,6 +52,7 @@ from . import gsm8k
 from . import storycloze
 from . import toxigen
 from . import crowspairs
+from . import bigbench
 ########################################
 # Translation tasks
@@ -310,6 +311,7 @@ TASK_REGISTRY = {
     # "storycloze_2016": storycloze.StoryCloze2016,
     # "storycloze_2018": storycloze.StoryCloze2018,
     # "sat": sat.SATAnalogies,
+    **bigbench.create_all_tasks(),
 }
...
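The **bigbench.create_all_tasks() entry splices a generated mapping of task names to task classes into TASK_REGISTRY alongside the hand-written entries. A self-contained sketch of the pattern; the subtask names and the stand-in class are illustrative, not the real bigbench module.

# Illustrative factory pattern: create_all_tasks() returns {task_name: task_class},
# which ** merges into the registry next to hand-written entries.
class DummyTask:
    """Stand-in for a real Task subclass."""

def create_all_tasks():
    subtasks = ["anachronisms", "arithmetic"]  # illustrative subtask names
    return {f"bigbench_{name}": DummyTask for name in subtasks}

TASK_REGISTRY = {
    "existing_task": DummyTask,  # stands in for the hand-written entries
    **create_all_tasks(),
}
print(sorted(TASK_REGISTRY))  # ['bigbench_anachronisms', 'bigbench_arithmetic', 'existing_task']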
@@ -7,8 +7,6 @@ problem in natural language.
 Homepage: https://github.com/openai/gpt-3/tree/master/data
 """
-import inspect
-import lm_eval.datasets.arithmetic.arithmetic
 from lm_eval.base import Task, rf
 from lm_eval.metrics import mean
@@ -30,7 +28,7 @@ _CITATION = """
 class Arithmetic(Task):
     VERSION = 0
-    DATASET_PATH = inspect.getfile(lm_eval.datasets.arithmetic.arithmetic)
+    DATASET_PATH = "EleutherAI/arithmetic"
     def has_training_docs(self):
         return False
...
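With DATASET_PATH now pointing at a Hub dataset ID instead of a bundled script, the task data can be pulled directly from the Hugging Face Hub. A hedged sketch follows; the configuration name is an assumption (each arithmetic task selects one of the dataset's configurations).

# Hedged sketch: fetching the arithmetic data from the Hub. The config name is assumed.
import datasets

arith = datasets.load_dataset("EleutherAI/arithmetic", "arithmetic_2da")
split = next(iter(arith.values()))  # whichever split(s) the Hub dataset provides
print(split[0])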