Commit 74df9bea authored by zhaoying1's avatar zhaoying1
Browse files

added deepseekv2

parents
Pipeline #1652 failed with stages
in 0 seconds
# AMMLU subject config: medical_genetics.
# description is Arabic, roughly "Perform the evaluation in the field of
# other sciences", followed by a space and two newlines.
# NOTE(review): the first word "فم" looks like a typo for "قم" ("perform");
# confirm before changing, because this string is runtime text.
dataset_name: medical_genetics
description: "فم بعملية التقييم في مجال علوم أخرى \n\n"
include: _default_template_yaml
task: ammlu_medical_genetics
# AMMLU subject config: miscellaneous.
# description is Arabic, roughly "Perform the evaluation in the field of
# other sciences", followed by a space and two newlines.
# NOTE(review): the first word "فم" looks like a typo for "قم" ("perform");
# confirm before changing, because this string is runtime text.
dataset_name: miscellaneous
description: "فم بعملية التقييم في مجال علوم أخرى \n\n"
include: _default_template_yaml
task: ammlu_miscellaneous
# AMMLU subject config: moral_disputes.
# description is Arabic, roughly "Perform the evaluation in the field of
# the humanities", followed by a space and two newlines.
# NOTE(review): the first word "فم" looks like a typo for "قم" ("perform");
# confirm before changing, because this string is runtime text.
dataset_name: moral_disputes
description: "فم بعملية التقييم في مجال العلوم الانسانية \n\n"
include: _default_template_yaml
task: ammlu_moral_disputes
# AMMLU subject config: moral_scenarios.
# description is Arabic, roughly "Perform the evaluation in the field of
# the humanities", followed by a space and two newlines.
# NOTE(review): the first word "فم" looks like a typo for "قم" ("perform");
# confirm before changing, because this string is runtime text.
dataset_name: moral_scenarios
description: "فم بعملية التقييم في مجال العلوم الانسانية \n\n"
include: _default_template_yaml
task: ammlu_moral_scenarios
# AMMLU subject config: nutrition.
# description is Arabic, roughly "Perform the evaluation in the field of
# other sciences", followed by a space and two newlines.
# NOTE(review): the first word "فم" looks like a typo for "قم" ("perform");
# confirm before changing, because this string is runtime text.
dataset_name: nutrition
description: "فم بعملية التقييم في مجال علوم أخرى \n\n"
include: _default_template_yaml
task: ammlu_nutrition
# AMMLU subject config: philosophy.
# description is Arabic, roughly "Perform the evaluation in the field of
# the humanities", followed by a space and two newlines.
# NOTE(review): the first word "فم" looks like a typo for "قم" ("perform");
# confirm before changing, because this string is runtime text.
dataset_name: philosophy
description: "فم بعملية التقييم في مجال العلوم الانسانية \n\n"
include: _default_template_yaml
task: ammlu_philosophy
# AMMLU subject config: prehistory.
# description is Arabic, roughly "Perform the evaluation in the field of
# the humanities", followed by a space and two newlines.
# NOTE(review): the first word "فم" looks like a typo for "قم" ("perform");
# confirm before changing, because this string is runtime text.
dataset_name: prehistory
description: "فم بعملية التقييم في مجال العلوم الانسانية \n\n"
include: _default_template_yaml
task: ammlu_prehistory
# AMMLU subject config: professional_accounting.
# description is Arabic, roughly "Perform the evaluation in the field of
# other sciences", followed by a space and two newlines.
# NOTE(review): the first word "فم" looks like a typo for "قم" ("perform");
# confirm before changing, because this string is runtime text.
dataset_name: professional_accounting
description: "فم بعملية التقييم في مجال علوم أخرى \n\n"
include: _default_template_yaml
task: ammlu_professional_accounting
# AMMLU subject config: professional_law.
# description is Arabic, roughly "Perform the evaluation in the field of
# the humanities", followed by a space and two newlines.
# NOTE(review): the first word "فم" looks like a typo for "قم" ("perform");
# confirm before changing, because this string is runtime text.
dataset_name: professional_law
description: "فم بعملية التقييم في مجال العلوم الانسانية \n\n"
include: _default_template_yaml
task: ammlu_professional_law
# AMMLU subject config: professional_medicine.
# description is Arabic, roughly "Perform the evaluation in the field of
# other sciences", followed by a space and two newlines.
# NOTE(review): the first word "فم" looks like a typo for "قم" ("perform");
# confirm before changing, because this string is runtime text.
dataset_name: professional_medicine
description: "فم بعملية التقييم في مجال علوم أخرى \n\n"
include: _default_template_yaml
task: ammlu_professional_medicine
# AMMLU subject config: professional_psychology.
# description is Arabic, roughly "Perform the evaluation in the field of
# the social sciences", followed by a space and two newlines.
# NOTE(review): the first word "فم" looks like a typo for "قم" ("perform");
# confirm before changing, because this string is runtime text.
dataset_name: professional_psychology
description: "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
include: _default_template_yaml
task: ammlu_professional_psychology
# AMMLU subject config: public_relations.
# description is Arabic, roughly "Perform the evaluation in the field of
# the social sciences", followed by a space and two newlines.
# NOTE(review): the first word "فم" looks like a typo for "قم" ("perform");
# confirm before changing, because this string is runtime text.
dataset_name: public_relations
description: "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
include: _default_template_yaml
task: ammlu_public_relations
# AMMLU subject config: security_studies.
# description is Arabic, roughly "Perform the evaluation in the field of
# the social sciences", followed by a space and two newlines.
# NOTE(review): the first word "فم" looks like a typo for "قم" ("perform");
# confirm before changing, because this string is runtime text.
dataset_name: security_studies
description: "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
include: _default_template_yaml
task: ammlu_security_studies
# AMMLU subject config: sociology.
# description is Arabic, roughly "Perform the evaluation in the field of
# the social sciences", followed by a space and two newlines.
# NOTE(review): the first word "فم" looks like a typo for "قم" ("perform");
# confirm before changing, because this string is runtime text.
dataset_name: sociology
description: "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
include: _default_template_yaml
task: ammlu_sociology
# AMMLU subject config: us_foreign_policy.
# description is Arabic, roughly "Perform the evaluation in the field of
# the social sciences", followed by a space and two newlines.
# NOTE(review): the first word "فم" looks like a typo for "قم" ("perform");
# confirm before changing, because this string is runtime text.
dataset_name: us_foreign_policy
description: "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
include: _default_template_yaml
task: ammlu_us_foreign_policy
# AMMLU subject config: virology.
# description is Arabic, roughly "Perform the evaluation in the field of
# other sciences", followed by a space and two newlines.
# NOTE(review): the first word "فم" looks like a typo for "قم" ("perform");
# confirm before changing, because this string is runtime text.
dataset_name: virology
description: "فم بعملية التقييم في مجال علوم أخرى \n\n"
include: _default_template_yaml
task: ammlu_virology
# AMMLU subject config: world_religions.
# description is Arabic, roughly "Perform the evaluation in the field of
# the humanities", followed by a space and two newlines.
# NOTE(review): the first word "فم" looks like a typo for "قم" ("perform");
# confirm before changing, because this string is runtime text.
dataset_name: world_religions
description: "فم بعملية التقييم في مجال العلوم الانسانية \n\n"
include: _default_template_yaml
task: ammlu_world_religions
# ANLI
### Paper
Title: `Adversarial NLI: A New Benchmark for Natural Language Understanding`
Paper Link: https://arxiv.org/abs/1910.14599
Adversarial NLI (ANLI) is a dataset collected via an iterative, adversarial
human-and-model-in-the-loop procedure. It consists of three rounds that progressively
increase in difficulty and complexity, and each question-answer includes annotator-
provided explanations.
Homepage: https://github.com/facebookresearch/anli
### Citation
```
@inproceedings{nie-etal-2020-adversarial,
title = "Adversarial {NLI}: A New Benchmark for Natural Language Understanding",
author = "Nie, Yixin and
Williams, Adina and
Dinan, Emily and
Bansal, Mohit and
Weston, Jason and
Kiela, Douwe",
booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
year = "2020",
publisher = "Association for Computational Linguistics",
}
```
### Groups and Tasks
#### Groups
* `anli`: Evaluates `anli_r1`, `anli_r2`, and `anli_r3`
#### Tasks
* `anli_r1`: The data collected adversarially in the first round.
* `anli_r2`: The data collected adversarially in the second round, after training on the previous round's data.
* `anli_r3`: The data collected adversarially in the third round, after training on the data from the previous rounds.
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
# ANLI round 1 task config: a three-way multiple-choice task built from the
# premise/hypothesis fields, using the *_r1 splits.
group:
  - anli
task: anli_r1
dataset_path: anli
dataset_name: null
output_type: multiple_choice
training_split: train_r1
validation_split: dev_r1
test_split: test_r1
doc_to_text: "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither?\nAnswer:"
# True = entailment
# False = contradiction
# Neither = neutral
# `label` indexes the list below, so 0 -> True, 1 -> Neither, 2 -> False;
# the order must stay in sync with doc_to_choice.
doc_to_target: "{{['True', 'Neither', 'False'][label]}}"
doc_to_choice:
  - "True"
  - "Neither"
  - "False"
should_decontaminate: true
doc_to_decontamination_query: premise
# aggregation and higher_is_better belong to the `acc` metric entry, so they
# are nested under the same list item rather than left as top-level keys.
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
# version is a field of metadata, not a top-level key.
metadata:
  version: 1.0
# ANLI round 2 task config: names the task and points every split at the
# *_r2 data.
task: anli_r2
training_split: train_r2
validation_split: dev_r2
test_split: test_r2
# Remaining settings are not repeated here; presumably the loader takes them
# from the included round-1 file — verify against the config loader.
include: anli_r1.yaml
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment