@@ -8,7 +8,7 @@ We’d like your help to test it out! You can help by:
1. Trying out your current workloads on the big-refactor branch, and seeing if anything breaks or is counterintuitive,
2. Porting tasks supported in the previous version of the harness to the new YAML configuration format. Please check out our [task implementation guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md) for more information.
If you choose to port a task not yet completed according to [our checklist](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/tasks/README.md), then you can contribute it by opening a PR containing [Refactor] in the name with:
If you choose to port a task not yet completed according to [our checklist](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/tasks/README.md), then you can contribute it by opening a PR containing [Refactor] in the name with:
- A shell command to run the task in the `master` branch, and the resulting score
- A shell command to run the task in your PR branch (the one targeting `big-refactor`), and the resulting score, to show that the two implementations produce the same result.
# note: peft_path can be different from the pretrained model path
...
...
@@ -262,7 +264,9 @@ class HuggingFaceAutoLM(BaseLM):
try:
self.model.to(self._device)
except:
print("Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes`. If the desired GPU is being used, this message is safe to ignore.")
print(
"Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes`. If the desired GPU is being used, this message is safe to ignore."
)
def_create_auto_model(
self,
...
...
@@ -280,6 +284,7 @@ class HuggingFaceAutoLM(BaseLM):
@@ -12,7 +12,7 @@ from lm_eval.base import MultipleChoiceTask
# BibTeX citation text kept as a module-level string.
# NOTE(review): the entry key is `huang2023ceval` and it carries the C-Eval
# title twice, followed by a second title/author set for "Latent Retrieval for
# Weakly Supervised Open Domain Question Answering" (Lee et al., 2019). This
# looks like two different citations merged into one entry — confirm which
# paper this module should cite before relying on the string.
_CITATION="""
@article{huang2023ceval,
title={C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models},
title={C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models},
author={Huang, Yuzhen and Bai, Yuzhuo and Zhu, Zhihao and Zhang, Junlei and Zhang, Jinghan and Su, Tangjun and Liu, Junteng and Lv, Chuancheng and Zhang, Yikai and Lei, Jiayi and Fu, Yao and Sun, Maosong and He, Junxian},
title = "Latent Retrieval for Weakly Supervised Open Domain Question Answering",
author = "Lee, Kenton and
Chang, Ming-Wei and
Toutanova, Kristina",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/P19-1612",
doi = "10.18653/v1/P19-1612",
pages = "6086--6096",
abstract = "Recent work on open domain question answering (QA) assumes strong supervision of the supporting evidence and/or assumes a blackbox information retrieval (IR) system to retrieve evidence candidates. We argue that both are suboptimal, since gold evidence is not always available, and QA is fundamentally different from IR. We show for the first time that it is possible to jointly learn the retriever and reader from question-answer string pairs and without any IR system. In this setting, evidence retrieval from all of Wikipedia is treated as a latent variable. Since this is impractical to learn from scratch, we pre-train the retriever with an Inverse Cloze Task. We evaluate on open versions of five QA datasets. On datasets where the questioner already knows the answer, a traditional IR system such as BM25 is sufficient. On datasets where a user is genuinely seeking an answer, we show that learned retrieval is crucial, outperforming BM25 by up to 19 points in exact match.",
}
"""
class NQOpen(Task):
    """Question-answering task over the ``nq_open`` dataset."""

    # Task version number.
    # NOTE(review): presumably bumped when the task's behavior changes — confirm.
    VERSION = 0
    # Dataset identifier; presumably consumed by the ``Task`` base class when
    # populating ``self.dataset`` — confirm against the base class.
    DATASET_PATH = "nq_open"
    # No dataset sub-configuration is selected.
    DATASET_NAME = None
def has_training_docs(self) -> bool:
    """Return ``True``: this task exposes a training split."""
    return True
def has_validation_docs(self) -> bool:
    """Return ``True``: this task exposes a validation split."""
    return True
def has_test_docs(self) -> bool:
    """Return ``False``: this task does not expose a test split."""
    return False
def training_docs(self):
    """Return the ``"train"`` entry of ``self.dataset``.

    Raises:
        KeyError: if ``self.dataset`` is a mapping without a ``"train"`` key.
    """
    return self.dataset["train"]
def validation_docs(self):
    """Return the ``"validation"`` entry of ``self.dataset``.

    Raises:
        KeyError: if ``self.dataset`` is a mapping without a ``"validation"``
            key.
    """
    return self.dataset["validation"]
deftest_docs(self):
raiseNotImplementedError()
def doc_to_text(self, doc) -> str:
    """Render the prompt for one document.

    Args:
        doc: mapping that provides a ``"question"`` entry.

    Returns:
        The question prefixed with ``Q: `` and followed by a newline and
        ``A:`` (no trailing space after the colon).
    """
    return f"Q: {doc['question']}\nA:"
def should_decontaminate(self) -> bool:
    """Return ``True``: decontamination is enabled for this task."""
    return True
def doc_to_decontamination_query(self, doc):
    """Return the document's ``"question"`` entry unchanged.

    Args:
        doc: mapping that provides a ``"question"`` entry.
    """
    return doc["question"]
def doc_to_target(self, doc) -> str:
    """Return the first listed answer, prefixed with a single space.

    Args:
        doc: mapping whose ``"answer"`` entry is an indexable sequence of
            strings; only element 0 is used.

    Raises:
        IndexError: if the ``"answer"`` sequence is empty.
    """
    # The leading space separates the target from the "A:" that ends the prompt.
    return " " + doc["answer"][0]
defconstruct_requests(self,doc,ctx):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question