added deepseekv2

74df9bea · zhaoying1 · 74df9bea · 74df9bea · 74df9bea · 74df9bea
Commit 74df9bea authored Sep 02, 2024 by zhaoying1
20 changed files
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_medical_genetics.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_medical_genetics.yaml
+# AMMLU task config for the "medical_genetics" subset; includes the shared
+# _default_template_yaml. The description is an Arabic instruction naming the
+# subject category (roughly "other sciences").
+# NOTE(review): its first word "فم" ("mouth") looks like a typo for "قم"
+# ("perform") — confirm, and fix in all AMMLU configs together if so.
+"dataset_name": "medical_genetics"
+"description": "فم بعملية التقييم في مجال علوم أخرى \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_medical_genetics"
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_miscellaneous.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_miscellaneous.yaml
+# AMMLU task config for the "miscellaneous" subset; includes the shared
+# _default_template_yaml. The description is an Arabic instruction naming the
+# subject category (roughly "other sciences").
+# NOTE(review): its first word "فم" ("mouth") looks like a typo for "قم"
+# ("perform") — confirm, and fix in all AMMLU configs together if so.
+"dataset_name": "miscellaneous"
+"description": "فم بعملية التقييم في مجال علوم أخرى \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_miscellaneous"
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_moral_disputes.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_moral_disputes.yaml
+# AMMLU task config for the "moral_disputes" subset; includes the shared
+# _default_template_yaml. The description is an Arabic instruction naming the
+# subject category (roughly "humanities").
+# NOTE(review): its first word "فم" ("mouth") looks like a typo for "قم"
+# ("perform") — confirm, and fix in all AMMLU configs together if so.
+"dataset_name": "moral_disputes"
+"description": "فم بعملية التقييم في مجال العلوم الانسانية \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_moral_disputes"
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_moral_scenarios.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_moral_scenarios.yaml
+# AMMLU task config for the "moral_scenarios" subset; includes the shared
+# _default_template_yaml. The description is an Arabic instruction naming the
+# subject category (roughly "humanities").
+# NOTE(review): its first word "فم" ("mouth") looks like a typo for "قم"
+# ("perform") — confirm, and fix in all AMMLU configs together if so.
+"dataset_name": "moral_scenarios"
+"description": "فم بعملية التقييم في مجال العلوم الانسانية \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_moral_scenarios"
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_nutrition.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_nutrition.yaml
+# AMMLU task config for the "nutrition" subset; includes the shared
+# _default_template_yaml. The description is an Arabic instruction naming the
+# subject category (roughly "other sciences").
+# NOTE(review): its first word "فم" ("mouth") looks like a typo for "قم"
+# ("perform") — confirm, and fix in all AMMLU configs together if so.
+"dataset_name": "nutrition"
+"description": "فم بعملية التقييم في مجال علوم أخرى \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_nutrition"
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_philosophy.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_philosophy.yaml
+# AMMLU task config for the "philosophy" subset; includes the shared
+# _default_template_yaml. The description is an Arabic instruction naming the
+# subject category (roughly "humanities").
+# NOTE(review): its first word "فم" ("mouth") looks like a typo for "قم"
+# ("perform") — confirm, and fix in all AMMLU configs together if so.
+"dataset_name": "philosophy"
+"description": "فم بعملية التقييم في مجال العلوم الانسانية \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_philosophy"
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_prehistory.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_prehistory.yaml
+# AMMLU task config for the "prehistory" subset; includes the shared
+# _default_template_yaml. The description is an Arabic instruction naming the
+# subject category (roughly "humanities").
+# NOTE(review): its first word "فم" ("mouth") looks like a typo for "قم"
+# ("perform") — confirm, and fix in all AMMLU configs together if so.
+"dataset_name": "prehistory"
+"description": "فم بعملية التقييم في مجال العلوم الانسانية \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_prehistory"
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_professional_accounting.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_professional_accounting.yaml
+# AMMLU task config for the "professional_accounting" subset; includes the
+# shared _default_template_yaml. The description is an Arabic instruction
+# naming the subject category (roughly "other sciences").
+# NOTE(review): its first word "فم" ("mouth") looks like a typo for "قم"
+# ("perform") — confirm, and fix in all AMMLU configs together if so.
+"dataset_name": "professional_accounting"
+"description": "فم بعملية التقييم في مجال علوم أخرى \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_professional_accounting"
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_professional_law.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_professional_law.yaml
+# AMMLU task config for the "professional_law" subset; includes the shared
+# _default_template_yaml. The description is an Arabic instruction naming the
+# subject category (roughly "humanities").
+# NOTE(review): its first word "فم" ("mouth") looks like a typo for "قم"
+# ("perform") — confirm, and fix in all AMMLU configs together if so.
+"dataset_name": "professional_law"
+"description": "فم بعملية التقييم في مجال العلوم الانسانية \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_professional_law"
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_professional_medicine.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_professional_medicine.yaml
+# AMMLU task config for the "professional_medicine" subset; includes the
+# shared _default_template_yaml. The description is an Arabic instruction
+# naming the subject category (roughly "other sciences").
+# NOTE(review): its first word "فم" ("mouth") looks like a typo for "قم"
+# ("perform") — confirm, and fix in all AMMLU configs together if so.
+"dataset_name": "professional_medicine"
+"description": "فم بعملية التقييم في مجال علوم أخرى \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_professional_medicine"
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_professional_psychology.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_professional_psychology.yaml
+# AMMLU task config for the "professional_psychology" subset; includes the
+# shared _default_template_yaml. The description is an Arabic instruction
+# naming the subject category (roughly "social sciences").
+# NOTE(review): its first word "فم" ("mouth") looks like a typo for "قم"
+# ("perform") — confirm, and fix in all AMMLU configs together if so.
+"dataset_name": "professional_psychology"
+"description": "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_professional_psychology"
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_public_relations.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_public_relations.yaml
+# AMMLU task config for the "public_relations" subset; includes the shared
+# _default_template_yaml. The description is an Arabic instruction naming the
+# subject category (roughly "social sciences").
+# NOTE(review): its first word "فم" ("mouth") looks like a typo for "قم"
+# ("perform") — confirm, and fix in all AMMLU configs together if so.
+"dataset_name": "public_relations"
+"description": "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_public_relations"
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_security_studies.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_security_studies.yaml
+# AMMLU task config for the "security_studies" subset; includes the shared
+# _default_template_yaml. The description is an Arabic instruction naming the
+# subject category (roughly "social sciences").
+# NOTE(review): its first word "فم" ("mouth") looks like a typo for "قم"
+# ("perform") — confirm, and fix in all AMMLU configs together if so.
+"dataset_name": "security_studies"
+"description": "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_security_studies"
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_sociology.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_sociology.yaml
+# AMMLU task config for the "sociology" subset; includes the shared
+# _default_template_yaml. The description is an Arabic instruction naming the
+# subject category (roughly "social sciences").
+# NOTE(review): its first word "فم" ("mouth") looks like a typo for "قم"
+# ("perform") — confirm, and fix in all AMMLU configs together if so.
+"dataset_name": "sociology"
+"description": "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_sociology"
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_us_foreign_policy.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_us_foreign_policy.yaml
+# AMMLU task config for the "us_foreign_policy" subset; includes the shared
+# _default_template_yaml. The description is an Arabic instruction naming the
+# subject category (roughly "social sciences").
+# NOTE(review): its first word "فم" ("mouth") looks like a typo for "قم"
+# ("perform") — confirm, and fix in all AMMLU configs together if so.
+"dataset_name": "us_foreign_policy"
+"description": "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_us_foreign_policy"
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_virology.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_virology.yaml
+# AMMLU task config for the "virology" subset; includes the shared
+# _default_template_yaml. The description is an Arabic instruction naming the
+# subject category (roughly "other sciences").
+# NOTE(review): its first word "فم" ("mouth") looks like a typo for "قم"
+# ("perform") — confirm, and fix in all AMMLU configs together if so.
+"dataset_name": "virology"
+"description": "فم بعملية التقييم في مجال علوم أخرى \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_virology"
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_world_religions.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/ammlu_world_religions.yaml
+# AMMLU task config for the "world_religions" subset; includes the shared
+# _default_template_yaml. The description is an Arabic instruction naming the
+# subject category (roughly "humanities").
+# NOTE(review): its first word "فم" ("mouth") looks like a typo for "قم"
+# ("perform") — confirm, and fix in all AMMLU configs together if so.
+"dataset_name": "world_religions"
+"description": "فم بعملية التقييم في مجال العلوم الانسانية \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_world_religions"
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/anli/README.md
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/anli/README.md
+# ANLI
+
+### Paper
+
+Title: `Adversarial NLI: A New Benchmark for Natural Language Understanding`
+
+Paper Link: https://arxiv.org/abs/1910.14599
+
+Adversarial NLI (ANLI) is a dataset collected via an iterative, adversarial
+human-and-model-in-the-loop procedure. It consists of three rounds that progressively
+increase in difficulty and complexity, and each question-answer pair includes
+annotator-provided explanations.
+
+Homepage: https://github.com/facebookresearch/anli
+
+### Citation
+
+```
+@inproceedings{nie-etal-2020-adversarial,
+    title = "Adversarial {NLI}: A New Benchmark for Natural Language Understanding",
+    author = "Nie, Yixin  and
+      Williams, Adina  and
+      Dinan, Emily  and
+      Bansal, Mohit  and
+      Weston, Jason  and
+      Kiela, Douwe",
+    booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
+    year = "2020",
+    publisher = "Association for Computational Linguistics",
+}
+```
+
+### Groups and Tasks
+
+#### Groups
+
+* `anli`: Evaluates `anli_r1`, `anli_r2`, and `anli_r3`
+
+#### Tasks
+* `anli_r1`: The data collected adversarially in the first round.
+* `anli_r2`: The data collected adversarially in the second round, after training on the previous round's data.
+* `anli_r3`: The data collected adversarially in the third round, after training on the data from the previous rounds.
+
+
+### Checklist
+
+For adding novel benchmarks/datasets to the library:
+  * [x] Is the task an existing benchmark in the literature?
+  * [x] Have you referenced the original paper that introduced the task?
+  * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+
+
+If other tasks on this dataset are already supported:
+* [ ] Is the "Main" variant of this task clearly denoted?
+* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/anli/anli_r1.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/anli/anli_r1.yaml
+# ANLI round 1: multiple-choice NLI task, listed under the `anli` group.
+# anli_r2.yaml includes this file and overrides the task name and split names,
+# so edits here also affect that variant.
+group:
+  - anli
+task: anli_r1
+dataset_path: anli
+dataset_name: null
+output_type: multiple_choice
+training_split: train_r1
+validation_split: dev_r1
+test_split: test_r1
+# Prompt: the premise, then the hypothesis posed as a True/False/Neither question.
+doc_to_text: "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither?\nAnswer:"
+# `label` indexes the answer list in doc_to_target, giving this mapping:
+#   0 -> True    = entailment
+#   1 -> Neither = neutral
+#   2 -> False   = contradiction
+doc_to_target: "{{['True', 'Neither', 'False'][label]}}"
+# Keep this list in the same order as the one indexed in doc_to_target.
+doc_to_choice:
+  - "True"
+  - "Neither"
+  - "False"
+# NOTE(review): presumably the premise field is the text checked for
+# train/test contamination — confirm against the harness documentation.
+should_decontaminate: true
+doc_to_decontamination_query: premise
+# Accuracy, averaged over examples; larger values are better.
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/anli/anli_r2.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/anli/anli_r2.yaml
+# ANLI round 2: reuses anli_r1.yaml via `include` and sets the round-2 task
+# name and split names below.
+include: anli_r1.yaml
+task: anli_r2
+training_split: train_r2
+validation_split: dev_r2
+test_split: test_r2