Commit 6572da7b authored by Baber

nits

parent bd255f61
@@ -13,25 +13,21 @@ class JudgeFilter(Filter):
     PROMPT = """You are an expert evaluator of question-answering systems. Your task is to determine if a given answer matches the ground truth answer in meaning and accuracy. You should respond with "yes", "no", or "unknown".
 Guidelines for evaluation:
-1. Focus on semantic meaning rather than exact wording
-2. Consider numerical accuracy when applicable
-3. Account for partial answers that contain the correct information plus additional details
-4. Recognize equivalent phrasings and synonyms
-5. Be lenient with minor grammatical differences
-6. For multi-part questions, all parts must be correct
-7. For questions requiring specific units, check unit correctness
-8. Respond with "unknown" when:
-   - The answer is ambiguous and could be interpreted multiple ways
-   - There is insufficient context to determine correctness
-   - The ground truth is incomplete or unclear
-   - The comparison requires external knowledge not provided
+1. For multiple-choice questions, the answer choice letters are enough to determine correctness
+2. Focus on semantic meaning rather than exact wording
+3. Consider numerical accuracy when applicable
+4. Account for partial answers that contain the correct information plus additional details
+5. Recognize equivalent phrasings and synonyms
+6. Be lenient with minor grammatical differences
+7. For multi-part questions, all parts must be correct
+8. For questions requiring specific units, check unit correctness. However, if the answer is correct in all other aspects, you may overlook minor unit errors
 Input format:
 Question: [The question being asked]
 Answer: [The answer given by the system]
 Ground Truth: [The known correct answer]
-Your response must be exactly "yes", "no", or "unknown", with no additional explanation.
+Your response must be exactly "yes" or "no", with no additional explanation.
 Example 1:
 Question: What is the capital of France?
@@ -46,15 +42,15 @@ class JudgeFilter(Filter):
 Your response: no
 Example 3:
-Question: What is the GDP of France in 2023?
-Answer: The economic output was substantial.
+Question: What is the GDP of France in 2023?\nA. 2 trillion USD\nB. 3.05 trillion USD\nC. 2.5 trillion USD
+Answer: B.
 Ground Truth: 3.05 trillion USD
-Your response: unknown
-Your response must be exactly "yes", "no", or "unknown", with no additional explanation!
+Your response: yes
+Your response must be exactly "yes" or "no", with no additional explanation!
 """
-    def __init__(self, url, model, **kwargs) -> None:
+    def __init__(self, url, model, prompt=None, **kwargs) -> None:
         """
         pass a string `regex` to run `re.compile(r"regex")` on.
         `fallback` defines the output returned if no matches for the regex are located.
@@ -68,20 +64,23 @@ class JudgeFilter(Filter):
         self.model = LocalChatCompletion(
             base_url=url, pretrained=model, num_concurrent=2, **kwargs
         )
+        self.prompt = self.PROMPT if prompt is None else prompt
     @staticmethod
-    def create_message(str) -> list[dict]:
-        return [{"role": "user", "content": str}]
+    def create_message(str) -> dict:
+        return {"role": "user", "content": str}
     def apply(self, resps: list[list[str]], docs: list[dict]) -> list[list[str]]:
         inputs = [
-            [
-                self.create_message(
-                    self.PROMPT
-                    + "\n\n"
-                    + f"Question: {doc['question']}\nAnswer: {resp}\nGround Truth: {doc['answer']}"
-                )
-            ]
+            self.create_message(
+                self.PROMPT
+                + "\n\n"
+                + f"Question: {doc['question']}\nAnswer: {resp}\nGround Truth: {doc['answer']}"
+            )
             for resp, doc in zip(resps, docs)
         ]
-        res = self.model.simple_async_generate([inputs], gen_kwargs={})
+        res = self.model.simple_async_generate(inputs, gen_kwargs={})
         return [[x] for x in res]
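Review note: a minimal, self-contained sketch of the message shape the updated `apply` now builds, under the assumption that a plain function stands in for the class method. The abridged prompt text and the sample `docs`/`resps` are hypothetical; only the dict shape and the list comprehension mirror the diff above.

```python
# Sketch only: the updated apply() builds a flat list[dict] and passes it to
# simple_async_generate directly (the old code wrapped each message in an
# extra list and then wrapped `inputs` again as [inputs]).
PROMPT = "You are an expert evaluator of question-answering systems. ..."  # abridged

def create_message(text: str) -> dict:
    # One chat-message dict per model response, matching the new create_message.
    return {"role": "user", "content": text}

# Hypothetical sample data standing in for real docs/resps:
docs = [{"question": "What is the capital of France?", "answer": "Paris"}]
resps = ["The capital of France is Paris."]

inputs = [
    create_message(
        PROMPT
        + "\n\n"
        + f"Question: {doc['question']}\nAnswer: {resp}\nGround Truth: {doc['answer']}"
    )
    for resp, doc in zip(resps, docs)
]

print(type(inputs[0]))    # <class 'dict'> -- no nested [[...]] wrapping anymore
print(inputs[0]["role"])  # user
```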
@@ -725,7 +725,7 @@ class TemplateAPI(TemplateLM):
     def simple_async_generate(
         self,
-        requests: Union[List[List[str]], List[List[dict]]],
+        requests: Union[List[List[str], list[list[dict]]], List[List[dict]]],
         gen_kwargs: dict,
     ):
         results = itertools.chain.from_iterable(
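Review note: a hedged illustration of the flattening step visible at the end of this hunk. Only the `itertools.chain.from_iterable` call comes from the diff; the per-chunk result lists below are made up for demonstration.

```python
import itertools

# Hypothetical per-chunk outputs, e.g. one list per concurrent request batch:
chunked_results = [["yes", "no"], ["unknown"], ["yes"]]

# simple_async_generate chains the chunks into one flat list, which
# JudgeFilter.apply then re-wraps as [[x] for x in res].
flat = list(itertools.chain.from_iterable(chunked_results))
print(flat)  # ['yes', 'no', 'unknown', 'yes']
```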