Commit c69f6c38 authored by cardy20

conflict solved

parent 18c0fa29
export PYTHONPATH=$PWD
python3 scripts/clean_training_data/generate_13_grams.py \
-dir /fsx/polyglot/massivetext_large_data/ \
-sdir /fsx/lime12/ngram_train2/ -n 13 -buckets 500
export PYTHONPATH=$PWD
python3 scripts/clean_training_data/generate_13_grams.py \
-dir /fsx/kevinai/data/ko/merged_raw/ \
-sdir /fsx/lime12/ngram_merged_raw -n 13 -buckets 500
\ No newline at end of file
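The two runs above differ only in the source corpus (-dir) and the shard output directory (-sdir); -n 13 sets the n-gram order and -buckets 500 the number of hash buckets the n-grams are sharded into. For orientation, here is a minimal sketch of the underlying technique (n-gram extraction plus hash bucketing), not the actual generate_13_grams.py implementation; every name in it is hypothetical.

```python
# Minimal sketch of n-gram extraction with hash bucketing; this is an
# illustration of the technique, not the generate_13_grams.py code.
import hashlib
from collections import defaultdict

def iter_ngrams(tokens, n=13):
    """Yield each successive n-gram of a token list as a single string."""
    for i in range(len(tokens) - n + 1):
        yield " ".join(tokens[i:i + n])

def bucket_id(ngram, num_buckets=500):
    """Map an n-gram to a stable bucket via a hash of its bytes."""
    digest = hashlib.md5(ngram.encode("utf-8")).digest()
    return int.from_bytes(digest[:8], "big") % num_buckets

def shard_ngrams(documents, n=13, num_buckets=500):
    """Group all n-grams from a document stream by bucket id."""
    buckets = defaultdict(list)
    for doc in documents:
        for gram in iter_ngrams(doc.split(), n):
            buckets[bucket_id(gram, num_buckets)].append(gram)
    return buckets
```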
@@ -50,18 +50,24 @@ from . import blimp
from . import asdiv
from . import gsm8k
from . import storycloze
from . import kobest
from . import nsmc
from . import klue
from . import ko_translation
from . import korquad
from . import korunsmile
from . import kohatespeech
from . import kold
from . import toxigen
from . import crowspairs
from . import json
from . import xcopa
from . import bigbench
from . import xstorycloze
from . import xwinograd
from . import pawsx
from . import xnli
from . import mgsm
########################################
# Translation tasks
@@ -113,15 +119,6 @@ TASK_REGISTRY = {
"record": superglue.ReCoRD,
"wic": superglue.WordsInContext,
"wsc": superglue.SGWinogradSchemaChallenge,
# Order by benchmark/genre?
"coqa": coqa.CoQA,
"drop": drop.DROP,
@@ -129,7 +126,6 @@ TASK_REGISTRY = {
"lambada_standard": lambada.LambadaStandard,
"lambada_openai_cloze": lambada_cloze.LambadaOpenAICloze,
"lambada_standard_cloze": lambada_cloze.LambadaStandardCloze,
# multilingual lambada
**lambada_multilingual.construct_tasks(),
"wikitext": wikitext.WikiText,
@@ -235,10 +231,6 @@ TASK_REGISTRY = {
"pile_ubuntu-irc": pile.PileUbuntuIrc,
"pile_wikipedia": pile.PileWikipedia,
"pile_youtubesubtitles": pile.PileYoutubeSubtitles,
# BLiMP
"blimp_adjunct_island": blimp.BlimpAdjunctIsland,
"blimp_anaphor_gender_agreement": blimp.BlimpAnaphorGenderAgreement,
@@ -307,8 +299,6 @@ TASK_REGISTRY = {
"blimp_wh_vs_that_no_gap_long_distance": blimp.BlimpWhVsThatNoGapLongDistance,
"blimp_wh_vs_that_with_gap": blimp.BlimpWhVsThatWithGap,
"blimp_wh_vs_that_with_gap_long_distance": blimp.BlimpWhVsThatWithGapLongDistance,
"toxigen": toxigen.ToxiGen,
"crows_pairs_english": crowspairs.CrowsPairsEnglish,
"crows_pairs_english_race_color": crowspairs.CrowsPairsEnglishRaceColor,
@@ -332,16 +322,16 @@ TASK_REGISTRY = {
"crows_pairs_french_nationality": crowspairs.CrowsPairsFrenchNationality,
"crows_pairs_french_physical_appearance": crowspairs.CrowsPairsFrenchPhysicalAppearance,
"crows_pairs_french_autre": crowspairs.CrowsPairsFrenchAutre,
# Requires manual download of data.
# "storycloze_2016": storycloze.StoryCloze2016,
# "storycloze_2018": storycloze.StoryCloze2018,
# "sat": sat.SATAnalogies,
"kold_level_a": kold.KoldLevelA,
"kold_level_b": kold.KoldLevelB,
"klue_sts": klue.STS,
"klue_ynat": klue.YNAT,
"klue_nli": klue.NLI,
"klue_mrc": klue.MRC,
"nsmc": nsmc.NSMC,
"korquad": korquad.Korquad,
"kobest_boolq": kobest.BoolQ,
@@ -351,19 +341,57 @@ TASK_REGISTRY = {
"kobest_sentineg": kobest.SentiNeg,
"ko_en_translation": ko_translation.KoEnTranslation,
"en_ko_translation": ko_translation.EnKoTranslation,
"korunsmile": korunsmile.KorUnSmile,
"kohatespeech":kohatespeech.HateSpeech,
"kohatespeech_gen_bias":kohatespeech.GenderBias,
"kohatespeech_apeach":kohatespeech.Apeach
=======
"kohatespeech_apeach":kohatespeech.Apeach,
**xcopa.construct_tasks(),
**bigbench.create_all_tasks(),
>>>>>>> 0542d35d5e56768dd9041ef9b88b90256970d843
**xstorycloze.create_all_tasks(),
**xwinograd.create_all_tasks(),
**pawsx.construct_tasks(),
**xnli.construct_tasks(),
**mgsm.construct_tasks(),
}
ALL_TASKS = sorted(list(TASK_REGISTRY))
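The splatted entries (**xcopa.construct_tasks(), **bigbench.create_all_tasks(), and the xstorycloze/xwinograd/pawsx/xnli/mgsm expansions) rely on a factory convention: each module returns a dict mapping task names to task classes, one per language or subtask, which the ** operator merges into TASK_REGISTRY. The exact factories live in the respective modules; a hypothetical sketch of the pattern:

```python
# Hypothetical sketch of the construct_tasks() factory convention used by
# the multilingual modules; class and language names here are illustrative.
_LANGS = ["et", "id", "it", "sw", "ta"]  # a subset, for illustration

class _MultilingualTaskBase:
    LANG = None  # overridden per language

def construct_tasks():
    """Return {task_name: task_class}, ready to splat into TASK_REGISTRY."""
    return {
        f"xcopa_{lang}": type(f"XCopa{lang.capitalize()}",
                              (_MultilingualTaskBase,), {"LANG": lang})
        for lang in _LANGS
    }
```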
_EXAMPLE_JSON_PATH = "split:key:/absolute/path/to/data.json"
def add_json_task(task_name):
"""Add a JSON perplexity task if the given task name matches the
JSON task specification.
See `json.JsonPerplexity`.
"""
if not task_name.startswith("json"):
return
def create_json_task():
splits = task_name.split("=", 1)
if len(splits) != 2 or not splits[1]:
raise ValueError(
"json tasks need a path argument pointing to the local "
"dataset, specified like this: json="
+ _EXAMPLE_JSON_PATH
+ ' (if there are no splits, use "train")'
)
json_path = splits[1]
if json_path == _EXAMPLE_JSON_PATH:
raise ValueError(
"please do not copy the example path directly, but substitute "
"it with a path to your local dataset"
)
return lambda: json.JsonPerplexity(json_path)
TASK_REGISTRY[task_name] = create_json_task()
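Once registered this way, the task name doubles as configuration: everything after the first "=" is interpreted as split:key:path, per _EXAMPLE_JSON_PATH. A hypothetical invocation through get_task (defined just below); the path is a placeholder, not a real dataset:

```python
# Hypothetical usage; the path below is a placeholder, not a real dataset.
# get_task() registers the name on the fly and returns the stored factory,
# which builds a json.JsonPerplexity over the given file when called.
task_factory = get_task("json=train:text:/data/my_corpus.json")
task = task_factory()
```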
def get_task(task_name):
try:
add_json_task(task_name)
return TASK_REGISTRY[task_name]
except KeyError:
print("Available tasks:")
@@ -396,4 +424,4 @@ def get_task_dict(task_name_list: List[Union[str, lm_eval.base.Task]]):
if not isinstance(task_object, str)
}
assert set(task_name_dict.keys()).isdisjoint(set(task_name_from_object_dict.keys()))
return {**task_name_dict, **task_name_from_object_dict}
INFO - 05/29/23 02:24:05 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/29/23 02:24:05 - 0:00:00 - Starting at pile document index 0
INFO - 05/29/23 02:26:29 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/29/23 02:26:29 - 0:00:00 - Starting at pile document index 106000
INFO - 05/29/23 02:29:19 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/29/23 02:29:19 - 0:00:00 - Starting at pile document index 0
INFO - 05/29/23 02:31:50 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/29/23 02:31:50 - 0:00:00 - Starting at pile document index 0
INFO - 05/29/23 02:32:22 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/29/23 02:32:22 - 0:00:00 - ngrams already generated and bucketed, skipping
INFO - 05/29/23 02:34:01 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/29/23 02:34:01 - 0:00:00 - Starting at pile document index 0
INFO - 05/29/23 02:34:58 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/29/23 02:34:58 - 0:00:00 - Starting at pile document index 0
INFO - 05/29/23 07:12:33 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/29/23 07:12:33 - 0:00:00 - Starting at pile document index 0
INFO - 05/29/23 07:26:46 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/29/23 07:26:46 - 0:00:00 - Starting at pile document index 0
INFO - 05/29/23 07:30:21 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/29/23 07:30:21 - 0:00:00 - Starting at pile document index 0
INFO - 05/29/23 07:31:54 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/29/23 07:31:54 - 0:00:00 - Starting at pile document index 0
INFO - 05/29/23 13:27:39 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/29/23 13:27:39 - 0:00:00 - Starting at pile document index 8432000
INFO - 05/29/23 13:30:28 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/29/23 13:30:28 - 0:00:00 - Starting at pile document index 0
INFO - 05/29/23 14:27:00 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/29/23 14:27:00 - 0:00:00 - Starting at pile document index 0
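The alternating "Starting at pile document index N" and "ngrams already generated and bucketed, skipping" lines reflect checkpointed resumption: progress is persisted so an interrupted run restarts from the last saved document index, and an already-completed run exits early. A minimal sketch of that behavior, assuming a simple on-disk checkpoint (the real script's mechanism may differ):

```python
# Minimal sketch of checkpointed resumption, assuming a simple JSON
# checkpoint file and a done-flag; the real script's format may differ.
import json
import os

CHECKPOINT = "ngram_checkpoint.json"
DONE_FLAG = "ngram_done.flag"

def process(doc):
    """Stand-in for per-document n-gram generation and bucketing."""

def run(documents):
    if os.path.exists(DONE_FLAG):
        print("ngrams already generated and bucketed, skipping")
        return
    start = 0
    if os.path.exists(CHECKPOINT):
        with open(CHECKPOINT) as f:
            start = json.load(f)["doc_index"]
    print(f"Starting at pile document index {start}")
    for i, doc in enumerate(documents[start:], start=start):
        process(doc)
        with open(CHECKPOINT, "w") as f:
            json.dump({"doc_index": i + 1}, f)
    open(DONE_FLAG, "w").close()
```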