Commit e0311cd5 authored by baberabb

Merge remote-tracking branch 'origin/big-refactor' into big-refactor_python_final

parents 96c60cf6 f86d6874
@@ -69,7 +69,7 @@ class OpenaiCompletionsLM(LM):
engine: str = "text-davinci-003",
truncate: bool = False,
batch_size: int = 1,
):
) -> None:
"""
:param engine: str
@@ -99,12 +99,12 @@ class OpenaiCompletionsLM(LM):
return self.end_of_text_token_id
@property
def max_length(self):
def max_length(self) -> int:
# Note: the OpenAI API supports up to 2049 tokens, with the first token being the first input token
return 2048
@property
def max_gen_toks(self):
def max_gen_toks(self) -> int:
return 256
@property
@@ -152,7 +152,7 @@ class OpenaiCompletionsLM(LM):
return self._loglikelihood_tokens(new_reqs)
def _loglikelihood_tokens(
self, requests, disable_tqdm=False
self, requests, disable_tqdm: bool = False
) -> List[Tuple[float, bool]]:
res = []
......
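The `max_length` and `max_gen_toks` properties annotated above cap each request: the prompt plus the generated continuation must fit in `max_length` tokens, with at most `max_gen_toks` of those generated. A minimal sketch of the usual truncation arithmetic, using hypothetical names rather than this file's code:

```python
# Hypothetical sketch: keep only the most recent context tokens so that
# prompt + generation still fits inside the model window.
MAX_LENGTH = 2048    # context window, cf. max_length above
MAX_GEN_TOKS = 256   # generation budget, cf. max_gen_toks above

def truncate_context(context_tokens: list) -> list:
    return context_tokens[-(MAX_LENGTH - MAX_GEN_TOKS):]
```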
@@ -41,7 +41,7 @@ def textsynth_completion(**kwargs):
@register_model("textsynth")
class TextSynthLM(LM):
def __init__(self, engine, truncate=False):
def __init__(self, engine, truncate: bool = False) -> None:
"""
:param engine: str
TextSynth API engine (e.g. `gptj_6B`)
@@ -62,12 +62,12 @@ class TextSynthLM(LM):
raise NotImplementedError()
@property
def max_length(self):
def max_length(self) -> int:
# NOTE: Turn on truncation to avoid errors on long inputs.
return 2048
@property
def max_gen_toks(self):
def max_gen_toks(self) -> int:
return 256
@property
......
@@ -5,7 +5,7 @@ from lm_eval.logger import eval_logger
# Stores prompts in a dictionary indexed by 2 levels:
# prompt category name, and prompt name.
# This allows us to access prompts by category and name.
PROMPT_REGISTRY = {
PROMPT_REGISTRY: dict[str, dict[str, str]] = {
"qa-basic": {
"question-newline-answer": "Question: {{question}}\nAnswer:",
"q-newline-a": "Q: {{question}}\nA:",
@@ -13,7 +13,7 @@ PROMPT_REGISTRY = {
}
def get_prompt(prompt_id: str, dataset_name=None, subset_name=None):
def get_prompt(prompt_id: str, dataset_name: str = None, subset_name: str = None):
# unpack prompt name
category_name, prompt_name = prompt_id.split(":")
if subset_name is None:
......
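`get_prompt` above splits the prompt identifier on `:` into a category name and a prompt name, which index the two levels of `PROMPT_REGISTRY`. A small self-contained illustration of that lookup, using the `qa-basic` entries shown earlier:

```python
PROMPT_REGISTRY = {
    "qa-basic": {
        "question-newline-answer": "Question: {{question}}\nAnswer:",
        "q-newline-a": "Q: {{question}}\nA:",
    },
}

def lookup(prompt_id: str) -> str:
    # Same split as get_prompt: "category:name"
    category_name, prompt_name = prompt_id.split(":")
    return PROMPT_REGISTRY[category_name][prompt_name]

print(lookup("qa-basic:q-newline-a"))  # Q: {{question}}\nA:
```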
@@ -15,7 +15,7 @@ from lm_eval.api.registry import (
)
def register_configurable_task(config):
def register_configurable_task(config: dict[str, str]) -> int:
SubClass = type(
config["task"] + "ConfigurableTask",
(ConfigurableTask,),
@@ -38,7 +38,7 @@ def register_configurable_task(config):
return 0
def check_prompt_config(config):
def check_prompt_config(config: dict[str, str]) -> List[dict[str, str]]:
all_configs = []
if "use_prompt" in config:
prompt_list = prompts.load_prompt_list(
@@ -69,14 +69,14 @@ def check_prompt_config(config):
return all_configs
def get_task_name_from_config(task_config):
def get_task_name_from_config(task_config: dict[str, str]) -> str:
if "dataset_name" in task_config:
return "{dataset_path}_{dataset_name}".format(**task_config)
else:
return "{dataset_path}".format(**task_config)
def include_task_folder(task_dir):
def include_task_folder(task_dir: str) -> None:
"""
Calling this function registers all task YAML configs found under task_dir.
"""
......
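For reference, `get_task_name_from_config` above derives the registered task name from the dataset fields alone; a quick illustration with hypothetical config values:

```python
def get_task_name_from_config(task_config: dict) -> str:
    # Mirrors the logic shown in the hunk above.
    if "dataset_name" in task_config:
        return "{dataset_path}_{dataset_name}".format(**task_config)
    return "{dataset_path}".format(**task_config)

print(get_task_name_from_config({"dataset_path": "wmt16", "dataset_name": "de-en"}))  # wmt16_de-en
print(get_task_name_from_config({"dataset_path": "pubmed_qa"}))                       # pubmed_qa
```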
def doc_to_text(doc):
def doc_to_text(doc) -> str:
return "{}\nQuestion: {} True, False or Neither?\nAnswer:".format(
doc["premise"],
doc["hypothesis"].strip()
......
@@ -15,7 +15,7 @@ def _preproc_doc(doc):
return doc
def doc_to_text(doc):
def doc_to_text(doc) -> str:
doc = _preproc_doc(doc)
return f"Scenario 1: {doc['scenarios'][0]}\nScenario 2: {doc['scenarios'][1]}\nQuestion: Is Scenario 1 preferable?\nAnswer:"
......
def doc_to_text(doc):
def doc_to_text(doc) -> str:
ctxs = "\n".join(doc["context"]["contexts"])
return "Abstract: {}\nQuestion: {}\nAnswer:".format(
ctxs, doc["question"], doc["final_decision"]
)
def doc_to_target(doc):
def doc_to_target(doc) -> str:
return " {}".format(doc["final_decision"])
......
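To make the formatting above concrete, here is what the two functions produce for a hypothetical document (field values invented for the example):

```python
doc = {
    "context": {"contexts": ["Aspirin was administered.", "Outcomes were recorded."]},
    "question": "Does aspirin help?",
    "final_decision": "yes",
}

# doc_to_text(doc) ->
#   "Abstract: Aspirin was administered.\nOutcomes were recorded.\nQuestion: Does aspirin help?\nAnswer:"
# doc_to_target(doc) ->
#   " yes"
```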
# Translation Tasks
### Paper
### Citation
```
```
### Groups and Tasks
#### Groups
* `gpt3_translation_tasks`
* `wmt14`
* `wmt16`
* `wmt20`
* `iwslt2017`
#### Tasks
*
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
* [ ] Checked for equivalence with v0.3.0 LM Evaluation Harness
# Generated by utils.py
dataset_name: iwslt2017-en-ar
dataset_path: iwslt2017
doc_to_target: ' {{translation["en"]}}'
doc_to_text: 'Arabic phrase: {{translation["ar"]}}
English phrase:'
group:
- greedy_until
- translation
- iwslt2017
include: wmt_common_yaml
task: iwslt2017-ar-en
# Generated by utils.py
dataset_name: iwslt2017-en-ar
dataset_path: iwslt2017
doc_to_target: ' {{translation["ar"]}}'
doc_to_text: 'English phrase: {{translation["en"]}}
Arabic phrase:'
group:
- greedy_until
- translation
- iwslt2017
include: wmt_common_yaml
task: iwslt2017-en-ar
import argparse
from typing import Dict, List
import yaml
import sacrebleu
try:
import pycountry
except ModuleNotFoundError:
raise Exception(
"`pycountry` is required for generating translation task prompt templates. \
Please install pycountry via pip install lm-eval[multilingual] or pip install -e .[multilingual]",
)
# Different translation benchmarks included in the library. Mostly WMT.
# These correspond to dataset names (subsets) on HuggingFace for each dataset.
# A yaml file is generated by this script for each language pair.
gpt3_translation_benchmarks = {
"wmt14": ["fr-en"], # ["en-fr", "fr-en"], # French
"wmt16": [
"ro-en",
"de-en",
], # ["en-ro", "ro-en", "de-en", "en-de"], # German, Romanian
}
# 28 total
LANGUAGES = {
**gpt3_translation_benchmarks,
# "wmt20": sacrebleu.get_langpairs_for_testset("wmt20"),
"iwslt2017": ["en-ar"], # Arabic
}
def code_to_language(code):
# key is alpha_2 or alpha_3 depending on the code length
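# Example (illustrative): code_to_language("fr") -> "French", code_to_language("ro") -> "Romanian"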
language_tuple = pycountry.languages.get(**{f"alpha_{len(code)}": code})
return language_tuple.name
def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
"""
Generate a yaml file for each language.
:param output_dir: The directory to output the files to.
:param overwrite: Whether to overwrite files if they already exist.
"""
err = []
for lang in LANGUAGES.keys():
for dataset_name in LANGUAGES[lang]:
src_lang, _, tgt_lang = dataset_name.partition("-")
for src, tgt in [[src_lang, tgt_lang], [tgt_lang, src_lang]]:
# both translation directions for each lang pair
lang_pair = src + "-" + tgt
file_name = f"{lang}_{lang_pair}.yaml"
try:
source, target = code_to_language(src), code_to_language(tgt)
groups = ["greedy_until", "translation", lang]
if lang in gpt3_translation_benchmarks.keys():
groups += ["gpt3_translation_benchmarks"]
with open(
f"{output_dir}/{file_name}",
"w" if overwrite else "x",
encoding="utf8",
) as f:
f.write("# Generated by utils.py\n")
yaml.dump(
{
"include": "wmt_common_yaml",
"group": groups,
"dataset_path": lang,
"dataset_name": dataset_name
if not (lang == "iwslt2017")
else "iwslt2017-" + dataset_name,
"task": f"{lang}-{lang_pair}",
"doc_to_text": f"{source} phrase: "
+ "{{translation["
+ f'"{src}"'
+ "]}}\n"
+ f"{target} phrase:",
"doc_to_target": " {{"
+ "translation["
+ f'"{tgt}"]'
+ "}}",
},
f,
)
except FileExistsError:
err.append(file_name)
if len(err) > 0:
raise FileExistsError(
"Files were not created because they already exist (use --overwrite flag):"
f" {', '.join(err)}"
)
def main() -> None:
"""Parse CLI args and generate language-specific yaml files."""
parser = argparse.ArgumentParser()
parser.add_argument(
"--overwrite",
default=False,
action="store_true",
help="Overwrite files if they already exist",
)
parser.add_argument(
"--output-dir", default=".", help="Directory to write yaml files to"
)
args = parser.parse_args()
gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite)
if __name__ == "__main__":
main()
# Generated by utils.py
dataset_name: fr-en
dataset_path: wmt14
doc_to_target: ' {{translation["fr"]}}'
doc_to_text: 'English phrase: {{translation["en"]}}
French phrase:'
group:
- greedy_until
- translation
- wmt14
- gpt3_translation_benchmarks
include: wmt_common_yaml
task: wmt14-en-fr
# Generated by utils.py
dataset_name: fr-en
dataset_path: wmt14
doc_to_target: ' {{translation["en"]}}'
doc_to_text: 'French phrase: {{translation["fr"]}}
English phrase:'
group:
- greedy_until
- translation
- wmt14
- gpt3_translation_benchmarks
include: wmt_common_yaml
task: wmt14-fr-en
# Generated by utils.py
dataset_name: de-en
dataset_path: wmt16
doc_to_target: ' {{translation["en"]}}'
doc_to_text: 'German phrase: {{translation["de"]}}
English phrase:'
group:
- greedy_until
- translation
- wmt16
- gpt3_translation_benchmarks
include: wmt_common_yaml
task: wmt16-de-en
# Generated by utils.py
dataset_name: de-en
dataset_path: wmt16
doc_to_target: ' {{translation["de"]}}'
doc_to_text: 'English phrase: {{translation["en"]}}
German phrase:'
group:
- greedy_until
- translation
- wmt16
- gpt3_translation_benchmarks
include: wmt_common_yaml
task: wmt16-en-de
# Generated by utils.py
dataset_name: ro-en
dataset_path: wmt16
doc_to_target: ' {{translation["ro"]}}'
doc_to_text: 'English phrase: {{translation["en"]}}
Romanian phrase:'
group:
- greedy_until
- translation
- wmt16
- gpt3_translation_benchmarks
include: wmt_common_yaml
task: wmt16-en-ro
# Generated by utils.py
dataset_name: ro-en
dataset_path: wmt16
doc_to_target: ' {{translation["en"]}}'
doc_to_text: 'Romanian phrase: {{translation["ro"]}}
English phrase:'
group:
- greedy_until
- translation
- wmt16
- gpt3_translation_benchmarks
include: wmt_common_yaml
task: wmt16-ro-en
output_type: greedy_until
training_split: train
validation_split: validation
fewshot_split: validation
test_split: test
metric_list:
- metric: bleu
- metric: ter
- metric: chrf
generation_kwargs:
until:
- "\n"
do_sample: false
temperature: 0.0
repeats: 1
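Each generated per-pair YAML above pulls these shared settings in through its `include: wmt_common_yaml` key. A minimal sketch of how such an include is commonly resolved, assuming merge semantics where the including file's keys override the included base (an assumption, not code from this diff):

```python
import os
import yaml

def load_task_config(path: str) -> dict:
    # Assumed semantics: load the task YAML, then merge in the included base
    # file from the same directory, letting the task's own keys win.
    with open(path, encoding="utf8") as f:
        cfg = yaml.safe_load(f)
    base_name = cfg.pop("include", None)
    if base_name is not None:
        base_path = os.path.join(os.path.dirname(path), base_name)
        with open(base_path, encoding="utf8") as f:
            base = yaml.safe_load(f)
        cfg = {**base, **cfg}
    return cfg
```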
@@ -34,3 +34,15 @@ def wikitext_detokenizer(doc):
string = string.replace(" 's", "'s")
return string
def process_results(doc, results):
(loglikelihood,) = results
# IMPORTANT: wikitext counts number of words in *original doc before detokenization*
_words = len(re.split(r"\s+", doc["page"]))
_bytes = len(doc["page"].encode("utf-8"))
return {
"word_perplexity": (loglikelihood, _words),
"byte_perplexity": (loglikelihood, _bytes),
"bits_per_byte": (loglikelihood, _bytes),
}
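`process_results` above returns `(loglikelihood, weight)` pairs rather than final scores; the perplexity metrics are only computed when results are aggregated over the whole evaluation set. A hedged sketch of the standard aggregation formulas (assumed here, not shown in this diff):

```python
import math

def weighted_perplexity(pairs):
    # pairs: iterable of (loglikelihood, n_words) or (loglikelihood, n_bytes)
    lls, weights = zip(*pairs)
    return math.exp(-sum(lls) / sum(weights))

def bits_per_byte(pairs):
    # pairs: iterable of (loglikelihood, n_bytes)
    lls, n_bytes = zip(*pairs)
    return -sum(lls) / (sum(n_bytes) * math.log(2))
```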
@@ -7,6 +7,7 @@ validation_split: validation
test_split: test
doc_to_text: ""
doc_to_target: !function preprocess_wikitext.wikitext_detokenizer
process_results: !function preprocess_wikitext.process_results
should_decontaminate: true
doc_to_decontamination_query: "{{page}}"
metric_list:
......