"...git@developer.sourcefind.cn:modelzoo/qwen3_vllm.git" did not exist on "3158fd2ca094660c87dd9ed8f6b8fb9c0e2a925c"
Unverified Commit 027fe14c authored by Lintang Sutawika's avatar Lintang Sutawika Committed by GitHub
Browse files

Merge branch 'big-refactor' into benchmark-scripts

parents 32a70d89 4dfa8aba
...@@ -114,6 +114,8 @@ class LM(abc.ABC): ...@@ -114,6 +114,8 @@ class LM(abc.ABC):
additional_config = {} if additional_config is None else additional_config additional_config = {} if additional_config is None else additional_config
args = utils.simple_parse_args_string(arg_string) args = utils.simple_parse_args_string(arg_string)
args2 = {k: v for k, v in additional_config.items() if v is not None} args2 = {k: v for k, v in additional_config.items() if v is not None}
if args2.get("device") == "mps" or args.get("device") == "mps":
args["dtype"] = "float32"
return cls(**args, **args2) return cls(**args, **args2)
@property @property
......
...@@ -99,7 +99,7 @@ class HFLM(LM): ...@@ -99,7 +99,7 @@ class HFLM(LM):
if not (parallelize or accelerator.num_processes > 1): if not (parallelize or accelerator.num_processes > 1):
# use user-passed device # use user-passed device
device_list = set( device_list = set(
["cuda", "cpu"] ["cuda", "cpu", "mps"]
+ [f"cuda:{i}" for i in range(torch.cuda.device_count())] + [f"cuda:{i}" for i in range(torch.cuda.device_count())]
) )
if device: if device:
...@@ -107,6 +107,10 @@ class HFLM(LM): ...@@ -107,6 +107,10 @@ class HFLM(LM):
device = int(device) device = int(device)
self._device = torch.device(device) self._device = torch.device(device)
eval_logger.info(f"Using device '{device}'") eval_logger.info(f"Using device '{device}'")
if device == "mps":
eval_logger.info(
"MPS is still in beta and only supports float32; setting dtype to float32."
)
else: else:
eval_logger.info("Device not specified") eval_logger.info("Device not specified")
eval_logger.info(f"Cuda Available? {torch.cuda.is_available()}") eval_logger.info(f"Cuda Available? {torch.cuda.is_available()}")
......
# ANLI
### Paper
Title: `Adversarial NLI: A New Benchmark for Natural Language Understanding`
Abstract: `https://arxiv.org/abs/1910.14599`
Adversarial NLI (ANLI) is a dataset collected via an iterative, adversarial
human-and-model-in-the-loop procedure. It consists of three rounds that progressively
increase in difficulty and complexity, and each question-answer includes
annotator-provided explanations.
Homepage: `https://github.com/facebookresearch/anli`
### Citation
```
@inproceedings{nie-etal-2020-adversarial,
title = "Adversarial {NLI}: A New Benchmark for Natural Language Understanding",
author = "Nie, Yixin and
Williams, Adina and
Dinan, Emily and
Bansal, Mohit and
Weston, Jason and
Kiela, Douwe",
booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
year = "2020",
publisher = "Association for Computational Linguistics",
}
```
### Subtasks
The following tasks are defined in this folder:
* `anli_r1`: The data collected adversarially in the first round.
* `anli_r2`: The data collected adversarially in the second round, after training on the previous round's data.
* `anli_r3`: The data collected adversarially in the third round, after training on the data from the previous rounds.
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group: group:
- anli - multiple_choice
- natural_language_inference
- nli
- adverserial
task: anli_r1 task: anli_r1
dataset_path: anli dataset_path: anli
dataset_name: null
output_type: multiple_choice output_type: multiple_choice
training_split: train_r1 training_split: train_r1
validation_split: dev_r1 validation_split: dev_r1
test_split: test_r1 test_split: test_r1
doc_to_text: "{{premise}}\nQuestion: {{hypothesis}}. True, False, or Neither?\nAnswer:" doc_to_text: "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither?\nAnswer:"
doc_to_target: " {{label}}" # this will be cast to an int. # True = entailment
template_aliases: "{% set answer_choices = ['True', 'False', 'Neither'] %}" # False = contradiction
# Neither = neutral
doc_to_target: "{{['True', 'Neither', 'False'][label]}}"
doc_to_choice:
- "True"
- "Neither"
- "False"
should_decontaminate: true
doc_to_decontamination_query: premise
metric_list: metric_list:
- metric: acc - metric: acc
aggregation: mean aggregation: mean
......
include: anli_r1.yaml group:
- multiple_choice
- natural_language_inference
- nli
- adverserial
task: anli_r2 task: anli_r2
dataset_path: anli
dataset_name: null
output_type: multiple_choice
training_split: train_r2 training_split: train_r2
validation_split: dev_r2 validation_split: dev_r2
test_split: test_r2 test_split: test_r2
doc_to_text: "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither?\nAnswer:"
# True = entailment
# False = contradiction
# Neither = neutral
doc_to_target: "{{['True', 'Neither', 'False'][label]}}"
doc_to_choice:
- "True"
- "Neither"
- "False"
should_decontaminate: true
doc_to_decontamination_query: premise
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
include: anli_r1.yaml group:
- multiple_choice
- natural_language_inference
- nli
- adverserial
task: anli_r3 task: anli_r3
dataset_path: anli
dataset_name: null
output_type: multiple_choice
training_split: train_r3 training_split: train_r3
validation_split: dev_r3 validation_split: dev_r3
test_split: test_r3 test_split: test_r3
doc_to_text: "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither?\nAnswer:"
# True = entailment
# False = contradiction
# Neither = neutral
doc_to_target: "{{['True', 'Neither', 'False'][label]}}"
doc_to_choice:
- "True"
- "Neither"
- "False"
should_decontaminate: true
doc_to_decontamination_query: premise
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
...@@ -58,7 +58,6 @@ def main(): ...@@ -58,7 +58,6 @@ def main():
ctx = task.fewshot_context( ctx = task.fewshot_context(
doc=doc, doc=doc,
num_fewshot=args.num_fewshot, num_fewshot=args.num_fewshot,
rnd=rnd,
) )
f.write(ctx + "\n") f.write(ctx + "\n")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment