Unverified Commit 5a766ac5 authored by farzanehnakhaee70's avatar farzanehnakhaee70 Committed by GitHub
Browse files

Merge branch 'big-refactor' into big-refactor

parents a0c1dbbd 01cfb2ff
"""
The Pile: An 800GB Dataset of Diverse Text for Language Modeling
https://arxiv.org/pdf/2101.00027.pdf
The Pile is a 825 GiB diverse, open source language modelling data set that consists
of 22 smaller, high-quality datasets combined together. To score well on Pile
BPB (bits per byte), a model must be able to understand many disparate domains
including books, github repositories, webpages, chat logs, and medical, physics,
math, computer science, and philosophy papers.
Homepage: https://pile.eleuther.ai/
"""
from lm_eval.api.task import PerplexityTask
from lm_eval.api.register import register_task, register_group
_CITATION = """
@article{pile,
title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},
author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},
journal={arXiv preprint arXiv:2101.00027},
year={2020}
}
"""
class PilePerplexityTask(PerplexityTask):
VERSION = "2.0"
DATASET_PATH = "EleutherAI/the_pile"
DATASET_NAME = None
def has_training_docs(self):
return False
def test_docs(self):
for doc in self.dataset["train"].select(range(100)):
yield doc
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def doc_to_target(self, doc):
return doc["text"]
# def validation_docs(self):
# for doc in self.dataset["validation"]:
# yield doc["text"]
# def test_docs(self):
# for doc in self.dataset["test"]:
# yield doc["text"]
class PileArxiv(PilePerplexityTask):
DATASET_NAME = "pile_arxiv"
class PileBooks3(PilePerplexityTask):
DATASET_NAME = "pile_books3"
class PileBookCorpus2(PilePerplexityTask):
DATASET_NAME = "pile_bookcorpus2"
class PileDmMathematics(PilePerplexityTask):
DATASET_NAME = "pile_dm-mathematics"
@register_task("pile_enron")
class PileEnron(PilePerplexityTask):
DATASET_NAME = "enron_emails"
class PileEuroparl(PilePerplexityTask):
DATASET_NAME = "pile_europarl"
class PileFreeLaw(PilePerplexityTask):
DATASET_NAME = "pile_freelaw"
class PileGithub(PilePerplexityTask):
DATASET_NAME = "pile_github"
class PileGutenberg(PilePerplexityTask):
DATASET_NAME = "pile_gutenberg"
class PileHackernews(PilePerplexityTask):
DATASET_NAME = "pile_hackernews"
class PileNIHExporter(PilePerplexityTask):
DATASET_NAME = "pile_nih-exporter"
class PileOpenSubtitles(PilePerplexityTask):
DATASET_NAME = "pile_opensubtitles"
class PileOpenWebText2(PilePerplexityTask):
DATASET_NAME = "pile_openwebtext2"
class PilePhilPapers(PilePerplexityTask):
DATASET_NAME = "pile_philpapers"
class PilePileCc(PilePerplexityTask):
DATASET_NAME = "pile_pile-cc"
class PilePubmedAbstracts(PilePerplexityTask):
DATASET_NAME = "pile_pubmed-abstracts"
class PilePubmedCentral(PilePerplexityTask):
DATASET_NAME = "pile_pubmed-central"
class PileStackExchange(PilePerplexityTask):
DATASET_NAME = "pile_stackexchange"
class PileUspto(PilePerplexityTask):
DATASET_NAME = "pile_upsto"
class PileUbuntuIrc(PilePerplexityTask):
DATASET_NAME = "pile_ubuntu-irc"
class PileWikipedia(PilePerplexityTask):
DATASET_NAME = "pile_wikipedia"
class PileYoutubeSubtitles(PilePerplexityTask):
DATASET_NAME = "pile_youtubesubtitles"
......@@ -20,4 +20,4 @@ Homepage: https://pile.eleuther.ai/
journal={arXiv preprint arXiv:2101.00027},
year={2020}
}
```
\ No newline at end of file
```
group:
- pile
- perplexity
- loglikelihood_rolling
task: pile_arxiv
dataset_path: EleutherAI/the_pile
dataset_name: pile_arxiv
......@@ -19,4 +21,4 @@ metric_list:
higher_is_better: false
- metric: bits_per_byte
aggregation: bits_per_byte
higher_is_better: false
\ No newline at end of file
higher_is_better: false
group:
- pile
include: pile_arxiv.yaml
task: pile_bookcorpus2
dataset_path: EleutherAI/the_pile
dataset_name: pile_bookcorpus2
output_type: loglikelihood_rolling
test_split: train
template_aliases: ""
doc_to_text: ""
doc_to_target: "{{text}}"
should_decontaminate: true
doc_to_decontamination_query: "{{text}}"
metric_list:
- metric: word_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: byte_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: bits_per_byte
aggregation: bits_per_byte
higher_is_better: false
\ No newline at end of file
group:
- pile
include: pile_arxiv.yaml
task: pile_books3
dataset_path: EleutherAI/the_pile
dataset_name: pile_books3
output_type: loglikelihood_rolling
test_split: train
template_aliases: ""
doc_to_text: ""
doc_to_target: "{{text}}"
should_decontaminate: true
doc_to_decontamination_query: "{{text}}"
metric_list:
- metric: word_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: byte_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: bits_per_byte
aggregation: bits_per_byte
higher_is_better: false
\ No newline at end of file
group:
- pile
include: pile_arxiv.yaml
task: pile_dm-mathematics
dataset_path: EleutherAI/the_pile
dataset_name: pile_dm-mathematics
output_type: loglikelihood_rolling
test_split: train
template_aliases: ""
doc_to_text: ""
doc_to_target: "{{text}}"
should_decontaminate: true
doc_to_decontamination_query: "{{text}}"
metric_list:
- metric: word_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: byte_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: bits_per_byte
aggregation: bits_per_byte
higher_is_better: false
\ No newline at end of file
group:
- pile
include: pile_arxiv.yaml
task: pile_enron
dataset_path: EleutherAI/the_pile
dataset_name: enron_emails
output_type: loglikelihood_rolling
test_split: train
template_aliases: ""
doc_to_text: ""
doc_to_target: "{{text}}"
should_decontaminate: true
doc_to_decontamination_query: "{{text}}"
metric_list:
- metric: word_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: byte_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: bits_per_byte
aggregation: bits_per_byte
higher_is_better: false
dataset_name: pile_enron
group:
- pile
include: pile_arxiv.yaml
task: pile_europarl
dataset_path: EleutherAI/the_pile
dataset_name: pile_europarl
output_type: loglikelihood_rolling
test_split: train
template_aliases: ""
doc_to_text: ""
doc_to_target: "{{text}}"
should_decontaminate: true
doc_to_decontamination_query: "{{text}}"
metric_list:
- metric: word_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: byte_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: bits_per_byte
aggregation: bits_per_byte
higher_is_better: false
\ No newline at end of file
dataset_name: pile_europarl
\ No newline at end of file
group:
- pile
include: pile_arxiv.yaml
task: pile_freelaw
dataset_path: EleutherAI/the_pile
dataset_name: pile_freelaw
output_type: loglikelihood_rolling
test_split: train
template_aliases: ""
doc_to_text: ""
doc_to_target: "{{text}}"
should_decontaminate: true
doc_to_decontamination_query: "{{text}}"
metric_list:
- metric: word_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: byte_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: bits_per_byte
aggregation: bits_per_byte
higher_is_better: false
\ No newline at end of file
group:
- pile
include: pile_arxiv.yaml
task: pile_github
dataset_path: EleutherAI/the_pile
dataset_name: pile_github
output_type: loglikelihood_rolling
test_split: train
template_aliases: ""
doc_to_text: ""
doc_to_target: "{{text}}"
should_decontaminate: true
doc_to_decontamination_query: "{{text}}"
metric_list:
- metric: word_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: byte_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: bits_per_byte
aggregation: bits_per_byte
higher_is_better: false
\ No newline at end of file
group:
- pile
include: pile_arxiv.yaml
task: pile_gutenberg
dataset_path: EleutherAI/the_pile
dataset_name: pile_gutenberg
output_type: loglikelihood_rolling
test_split: train
template_aliases: ""
doc_to_text: ""
doc_to_target: "{{text}}"
should_decontaminate: true
doc_to_decontamination_query: "{{text}}"
metric_list:
- metric: word_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: byte_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: bits_per_byte
aggregation: bits_per_byte
higher_is_better: false
\ No newline at end of file
dataset_name: pile_gutenberg
\ No newline at end of file
group:
- pile
include: pile_arxiv.yaml
task: pile_hackernews
dataset_path: EleutherAI/the_pile
dataset_name: pile_hackernews
output_type: loglikelihood_rolling
test_split: train
template_aliases: ""
doc_to_text: ""
doc_to_target: "{{text}}"
should_decontaminate: true
doc_to_decontamination_query: "{{text}}"
metric_list:
- metric: word_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: byte_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: bits_per_byte
aggregation: bits_per_byte
higher_is_better: false
\ No newline at end of file
group:
- pile
include: pile_arxiv.yaml
task: pile_nih-exporter
dataset_path: EleutherAI/the_pile
dataset_name: pile_nih-exporter
output_type: loglikelihood_rolling
test_split: train
template_aliases: ""
doc_to_text: ""
doc_to_target: "{{text}}"
should_decontaminate: true
doc_to_decontamination_query: "{{text}}"
metric_list:
- metric: word_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: byte_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: bits_per_byte
aggregation: bits_per_byte
higher_is_better: false
\ No newline at end of file
group:
- pile
include: pile_arxiv.yaml
task: pile_opensubtitles
dataset_path: EleutherAI/the_pile
dataset_name: pile_opensubtitles
output_type: loglikelihood_rolling
test_split: train
template_aliases: ""
doc_to_text: ""
doc_to_target: "{{text}}"
should_decontaminate: true
doc_to_decontamination_query: "{{text}}"
metric_list:
- metric: word_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: byte_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: bits_per_byte
aggregation: bits_per_byte
higher_is_better: false
\ No newline at end of file
group:
- pile
include: pile_arxiv.yaml
task: pile_openwebtext2
dataset_path: EleutherAI/the_pile
dataset_name: pile_openwebtext2
output_type: loglikelihood_rolling
test_split: train
template_aliases: ""
doc_to_text: ""
doc_to_target: "{{text}}"
should_decontaminate: true
doc_to_decontamination_query: "{{text}}"
metric_list:
- metric: word_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: byte_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: bits_per_byte
aggregation: bits_per_byte
higher_is_better: false
\ No newline at end of file
group:
- pile
include: pile_arxiv.yaml
task: pile_philpapers
dataset_path: EleutherAI/the_pile
dataset_name: pile_philpapers
output_type: loglikelihood_rolling
test_split: train
template_aliases: ""
doc_to_text: ""
doc_to_target: "{{text}}"
should_decontaminate: true
doc_to_decontamination_query: "{{text}}"
metric_list:
- metric: word_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: byte_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: bits_per_byte
aggregation: bits_per_byte
higher_is_better: false
\ No newline at end of file
group:
- pile
include: pile_arxiv.yaml
task: pile_pile-cc
dataset_path: EleutherAI/the_pile
dataset_name: pile_pile-cc
output_type: loglikelihood_rolling
test_split: train
template_aliases: ""
doc_to_text: ""
doc_to_target: "{{text}}"
should_decontaminate: true
doc_to_decontamination_query: "{{text}}"
metric_list:
- metric: word_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: byte_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: bits_per_byte
aggregation: bits_per_byte
higher_is_better: false
\ No newline at end of file
group:
- pile
include: pile_arxiv.yaml
task: pile_pubmed-abstracts
dataset_path: EleutherAI/the_pile
dataset_name: pile_pubmed-abstracts
output_type: loglikelihood_rolling
test_split: train
template_aliases: ""
doc_to_text: ""
doc_to_target: "{{text}}"
should_decontaminate: true
doc_to_decontamination_query: "{{text}}"
metric_list:
- metric: word_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: byte_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: bits_per_byte
aggregation: bits_per_byte
higher_is_better: false
\ No newline at end of file
group:
- pile
include: pile_arxiv.yaml
task: pile_pubmed-central
dataset_path: EleutherAI/the_pile
dataset_name: pile_pubmed-central
output_type: loglikelihood_rolling
test_split: train
template_aliases: ""
doc_to_text: ""
doc_to_target: "{{text}}"
should_decontaminate: true
doc_to_decontamination_query: "{{text}}"
metric_list:
- metric: word_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: byte_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: bits_per_byte
aggregation: bits_per_byte
higher_is_better: false
\ No newline at end of file
group:
- pile
include: pile_arxiv.yaml
task: pile_stackexchange
dataset_path: EleutherAI/the_pile
dataset_name: pile_stackexchange
output_type: loglikelihood_rolling
test_split: train
template_aliases: ""
doc_to_text: ""
doc_to_target: "{{text}}"
should_decontaminate: true
doc_to_decontamination_query: "{{text}}"
metric_list:
- metric: word_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: byte_perplexity
aggregation: weighted_perplexity
higher_is_better: false
- metric: bits_per_byte
aggregation: bits_per_byte
higher_is_better: false
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment