gaoqiong / lm-evaluation-harness

Commit 9f518392, authored Nov 28, 2023 by lintangsutawika

resolved merge conflict

Parents: 37ccb191, bf26d979

Showing 20 changed files with 74 additions and 34 deletions (+74 −34)
lm_eval/tasks/bbh/zeroshot/navigate.yaml  +2 −2
lm_eval/tasks/bbh/zeroshot/object_counting.yaml  +2 −2
lm_eval/tasks/bbh/zeroshot/penguins_in_a_table.yaml  +2 −2
lm_eval/tasks/bbh/zeroshot/reasoning_about_colored_objects.yaml  +2 −2
lm_eval/tasks/bbh/zeroshot/ruin_names.yaml  +2 −2
lm_eval/tasks/bbh/zeroshot/salient_translation_error_detection.yaml  +2 −2
lm_eval/tasks/bbh/zeroshot/snarks.yaml  +2 −2
lm_eval/tasks/bbh/zeroshot/sports_understanding.yaml  +2 −2
lm_eval/tasks/bbh/zeroshot/temporal_sequences.yaml  +2 −2
lm_eval/tasks/bbh/zeroshot/tracking_shuffled_objects_five_objects.yaml  +2 −2
lm_eval/tasks/bbh/zeroshot/tracking_shuffled_objects_seven_objects.yaml  +2 −2
lm_eval/tasks/bbh/zeroshot/tracking_shuffled_objects_three_objects.yaml  +2 −2
lm_eval/tasks/bbh/zeroshot/web_of_lies.yaml  +2 −2
lm_eval/tasks/bbh/zeroshot/word_sorting.yaml  +2 −2
lm_eval/tasks/bigbench/generate_until_template_yaml  +2 −2
lm_eval/tasks/bigbench/multiple_choice_template_yaml  +2 −2
lm_eval/tasks/minerva_math/utils.py  +1 −1
lm_eval/tasks/realtoxicityprompts/metric.py  +1 −1
lm_eval/tasks/scrolls/README.md  +31 −0
lm_eval/tasks/scrolls/scrolls.yaml  +9 −0
lm_eval/tasks/bbh/flan_zeroshot/navigate.yaml → lm_eval/tasks/bbh/zeroshot/navigate.yaml

 "dataset_name": "navigate"
 "description": "Given a series of navigation instructions, determine whether one would end up back at the starting point.\n\n"
 "doc_to_text": "Q: {{input}}\nA:"
-"include": "_flan_zeroshot_template_yaml"
-"task": "bbh_flan_zeroshot_navigate"
+"include": "_zeroshot_template_yaml"
+"task": "bbh_zeroshot_navigate"
lm_eval/tasks/bbh/flan_zeroshot/object_counting.yaml → lm_eval/tasks/bbh/zeroshot/object_counting.yaml

 "dataset_name": "object_counting"
 "description": "Questions that involve enumerating objects and asking the model to count them.\n\n"
 "doc_to_text": "Q: {{input}}\nA:"
-"include": "_flan_zeroshot_template_yaml"
-"task": "bbh_flan_zeroshot_object_counting"
+"include": "_zeroshot_template_yaml"
+"task": "bbh_zeroshot_object_counting"
lm_eval/tasks/bbh/flan_zeroshot/penguins_in_a_table.yaml → lm_eval/tasks/bbh/zeroshot/penguins_in_a_table.yaml

 "dataset_name": "penguins_in_a_table"
 "description": "Answer questions about a table of penguins and their attributes.\n\n"
 "doc_to_text": "Q: {{input}}\nA:"
-"include": "_flan_zeroshot_template_yaml"
-"task": "bbh_flan_zeroshot_penguins_in_a_table"
+"include": "_zeroshot_template_yaml"
+"task": "bbh_zeroshot_penguins_in_a_table"
lm_eval/tasks/bbh/flan_zeroshot/reasoning_about_colored_objects.yaml → lm_eval/tasks/bbh/zeroshot/reasoning_about_colored_objects.yaml

 "dataset_name": "reasoning_about_colored_objects"
 "description": "Answer extremely simple questions about the colors of objects on a surface.\n\n"
 "doc_to_text": "Q: {{input}}\nA:"
-"include": "_flan_zeroshot_template_yaml"
-"task": "bbh_flan_zeroshot_reasoning_about_colored_objects"
+"include": "_zeroshot_template_yaml"
+"task": "bbh_zeroshot_reasoning_about_colored_objects"
lm_eval/tasks/bbh/flan_zeroshot/ruin_names.yaml → lm_eval/tasks/bbh/zeroshot/ruin_names.yaml

 "dataset_name": "ruin_names"
 "description": "Select the humorous edit that 'ruins' the input movie or musical artist name.\n\n"
 "doc_to_text": "Q: {{input}}\nA:"
-"include": "_flan_zeroshot_template_yaml"
-"task": "bbh_flan_zeroshot_ruin_names"
+"include": "_zeroshot_template_yaml"
+"task": "bbh_zeroshot_ruin_names"
lm_eval/tasks/bbh/flan_zeroshot/salient_translation_error_detection.yaml → lm_eval/tasks/bbh/zeroshot/salient_translation_error_detection.yaml

 "dataset_name": "salient_translation_error_detection"
 "description": "Detect the type of error in an English translation of a German source sentence.\n\n"
 "doc_to_text": "Q: {{input}}\nA:"
-"include": "_flan_zeroshot_template_yaml"
-"task": "bbh_flan_zeroshot_salient_translation_error_detection"
+"include": "_zeroshot_template_yaml"
+"task": "bbh_zeroshot_salient_translation_error_detection"
lm_eval/tasks/bbh/flan_zeroshot/snarks.yaml → lm_eval/tasks/bbh/zeroshot/snarks.yaml

 "dataset_name": "snarks"
 "description": "Determine which of two sentences is sarcastic.\n\nAccording to Cambridge University Dictionary, sarcasm is \"the use of remarks that clearly mean the opposite of what they say, made in order to hurt someone's feelings or to criticize something in a humorous way.\" Sarcastic sentences often contain satirical or ironic utterances, hyperboles, ambivalent or witty remarks.\n\n"
 "doc_to_text": "Q: {{input}}\nA:"
-"include": "_flan_zeroshot_template_yaml"
-"task": "bbh_flan_zeroshot_snarks"
+"include": "_zeroshot_template_yaml"
+"task": "bbh_zeroshot_snarks"
lm_eval/tasks/bbh/flan_zeroshot/sports_understanding.yaml → lm_eval/tasks/bbh/zeroshot/sports_understanding.yaml

 "dataset_name": "sports_understanding"
 "description": "Determine whether an artificially constructed sentence relating to sports is plausible or not.\n\n"
 "doc_to_text": "Q: {{input}}\nA:"
-"include": "_flan_zeroshot_template_yaml"
-"task": "bbh_flan_zeroshot_sports_understanding"
+"include": "_zeroshot_template_yaml"
+"task": "bbh_zeroshot_sports_understanding"
lm_eval/tasks/bbh/flan_zeroshot/temporal_sequences.yaml → lm_eval/tasks/bbh/zeroshot/temporal_sequences.yaml

 "dataset_name": "temporal_sequences"
 "description": "Task description: Answer questions about which times certain events could have occurred.\n\n"
 "doc_to_text": "Q: {{input}}\nA:"
-"include": "_flan_zeroshot_template_yaml"
-"task": "bbh_flan_zeroshot_temporal_sequences"
+"include": "_zeroshot_template_yaml"
+"task": "bbh_zeroshot_temporal_sequences"
lm_eval/tasks/bbh/flan_zeroshot/tracking_shuffled_objects_five_objects.yaml → lm_eval/tasks/bbh/zeroshot/tracking_shuffled_objects_five_objects.yaml

 "dataset_name": "tracking_shuffled_objects_five_objects"
 "description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n"
 "doc_to_text": "Q: {{input}}\nA:"
-"include": "_flan_zeroshot_template_yaml"
-"task": "bbh_flan_zeroshot_tracking_shuffled_objects_five_objects"
+"include": "_zeroshot_template_yaml"
+"task": "bbh_zeroshot_tracking_shuffled_objects_five_objects"
lm_eval/tasks/bbh/flan_zeroshot/tracking_shuffled_objects_seven_objects.yaml → lm_eval/tasks/bbh/zeroshot/tracking_shuffled_objects_seven_objects.yaml

 "dataset_name": "tracking_shuffled_objects_seven_objects"
 "description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n"
 "doc_to_text": "Q: {{input}}\nA:"
-"include": "_flan_zeroshot_template_yaml"
-"task": "bbh_flan_zeroshot_tracking_shuffled_objects_seven_objects"
+"include": "_zeroshot_template_yaml"
+"task": "bbh_zeroshot_tracking_shuffled_objects_seven_objects"
lm_eval/tasks/bbh/flan_zeroshot/tracking_shuffled_objects_three_objects.yaml → lm_eval/tasks/bbh/zeroshot/tracking_shuffled_objects_three_objects.yaml

 "dataset_name": "tracking_shuffled_objects_three_objects"
 "description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n"
 "doc_to_text": "Q: {{input}}\nA:"
-"include": "_flan_zeroshot_template_yaml"
-"task": "bbh_flan_zeroshot_tracking_shuffled_objects_three_objects"
+"include": "_zeroshot_template_yaml"
+"task": "bbh_zeroshot_tracking_shuffled_objects_three_objects"
lm_eval/tasks/bbh/flan_zeroshot/web_of_lies.yaml → lm_eval/tasks/bbh/zeroshot/web_of_lies.yaml

 "dataset_name": "web_of_lies"
 "description": "Evaluate a random boolean function expressed as a word problem.\n\n"
 "doc_to_text": "Q: {{input}}\nA:"
-"include": "_flan_zeroshot_template_yaml"
-"task": "bbh_flan_zeroshot_web_of_lies"
+"include": "_zeroshot_template_yaml"
+"task": "bbh_zeroshot_web_of_lies"
lm_eval/tasks/bbh/flan_zeroshot/word_sorting.yaml → lm_eval/tasks/bbh/zeroshot/word_sorting.yaml

 "dataset_name": "word_sorting"
 "description": "Sort a list of words.\n\n"
 "doc_to_text": "Q: {{input}}\nA:"
-"include": "_flan_zeroshot_template_yaml"
-"task": "bbh_flan_zeroshot_word_sorting"
+"include": "_zeroshot_template_yaml"
+"task": "bbh_zeroshot_word_sorting"
lm_eval/tasks/bigbench/generate_until_template_yaml

-group: bigbench
-dataset_path: bigbench # will switch to `hails/bigbench` when all tasks are pushed
+group: bigbench_generate_until
+dataset_path: hails/bigbench
 output_type: generate_until
 dataset_kwargs:
   # num_shots: 0 # TODO: num of shots for `bigbench` HF dataset should be controlled through this, not through the typical methods
...
lm_eval/tasks/bigbench/multiple_choice_template_yaml

-group: bigbench
-dataset_path: bigbench # will switch to `hails/bigbench` when all tasks are pushed
+group: bigbench_multiple_choice
+dataset_path: hails/bigbench
 dataset_kwargs:
   # num_shots: 0 # TODO: num of shots for `bigbench` HF dataset should be controlled through this, not through the typical methods
   # subtask_name: null
...
lm_eval/tasks/minerva_math/utils.py

 import datasets
 import re
 import signal
-from lm_eval.logger import eval_logger
+from lm_eval.utils import eval_logger
 from typing import Optional, List, Dict

 try:
...
lm_eval/tasks/realtoxicityprompts/metric.py

@@ -3,7 +3,7 @@ import json
 import requests
 import numpy as np
-from lm_eval.logger import eval_logger
+from lm_eval.utils import eval_logger

 def toxicity_perspective_api(references, predictions, **kwargs):
...
lm_eval/tasks/scrolls/README.md (new file, mode 100644)

"""
SCROLLS: Standardized CompaRison Over Long Language Sequences
https://arxiv.org/abs/2201.03533

SCROLLS is a suite of datasets that require synthesizing information over long texts.
The benchmark includes seven natural language tasks across multiple domains,
including summarization, question answering, and natural language inference.

Homepage: https://www.scrolls-benchmark.com/

Since SCROLLS tasks are generally longer than the maximum sequence length of many models,
it is possible to create "subset" tasks that contain only those samples whose tokenized length
is less than some pre-defined limit. For example, to create a subset of "Qasper" that would
be suitable for a model using the GPTNeoX tokenizer and a 4K maximum sequence length:

```
class QasperGPTNeoX4K(Qasper):
    PRUNE_TOKENIZERS = ["EleutherAI/pythia-410m-deduped"]
    PRUNE_MAX_TOKENS = 4096
    PRUNE_NUM_PROC = _num_cpu_cores()  # optional, to speed up pruning of large datasets like NarrativeQA
```

`PRUNE_TOKENIZERS` can contain more than one tokenizer; this will include only samples that are
less than `PRUNE_MAX_TOKENS` for ALL of the tokenizers. This can be useful for comparing models
that use different tokenizers but the same maximum sequence length.

Once the subset task class has been defined in this file, it can be used by adding the class
to `lm_eval/tasks/__init__.py`.

NOTE: GovReport may need `max_gen_toks` set larger for causal models.
"""
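The multi-tokenizer pruning that the README describes can be sketched in a few lines. This is a simplified illustration, not the harness's implementation: `toy_tokenize` is a hypothetical stand-in for a real tokenizer's `encode()` (the harness uses Hugging Face tokenizers such as `EleutherAI/pythia-410m-deduped`), so real token counts will differ.

```python
def toy_tokenize(text):
    # Hypothetical stand-in for tokenizer.encode(); splits on whitespace
    # so the example is self-contained.
    return text.split()

def prune(samples, tokenize_fns, max_tokens):
    # Keep only samples whose tokenized length is within max_tokens
    # for ALL of the given tokenizers, mirroring PRUNE_TOKENIZERS /
    # PRUNE_MAX_TOKENS semantics.
    return [
        s for s in samples
        if all(len(fn(s)) <= max_tokens for fn in tokenize_fns)
    ]

docs = ["short doc", "a much longer document " * 3]
kept = prune(docs, [toy_tokenize], max_tokens=5)
# kept contains only "short doc"; the 12-"token" document is pruned.
```

Because the filter requires every tokenizer to stay under the limit, adding a second tokenizer to the list can only shrink the subset, which is what makes cross-tokenizer comparisons fair.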
lm_eval/tasks/scrolls/scrolls.yaml (new file, mode 100644)

group: scrolls
task:
  - scrolls_qasper
  - scrolls_quality
  - scrolls_narrativeqa
  - scrolls_contractnli
  - scrolls_govreport
  - scrolls_summscreenfd
  - scrolls_qmsum