Commit bd028848 authored by Baber

Merge branch 'main' into metrics

# Conflicts:
#	tests/test_tasks.py
parents 6e48110e 56def33d
......@@ -5,7 +5,7 @@
---
## Latest News 📣
- [2025/07] Added `think_end_token` arg to `hf` (token/str), `vllm` and `sglang` (str) for stripping CoT reasoning traces from models that support it.
- [2025/03] Added support for steering HF models!
- [2025/02] Added [SGLang](https://docs.sglang.ai/) support!
- [2024/09] We are prototyping support for tasks with text+image multimodal input and text output, and have added the `hf-multimodal` and `vllm-vlm` model types and the `mmmu` task as a prototype feature. We welcome users to try out this in-progress feature and stress-test it, and suggest they check out [`lmms-eval`](https://github.com/EvolvingLMMs-Lab/lmms-eval), a wonderful project originally forked from the lm-evaluation-harness, for a broader range of multimodal tasks, models, and features.
......
......@@ -21,7 +21,11 @@ When subclassing `TemplateAPI`, you need to implement the following methods:
1. `_create_payload`: Creates the JSON payload for API requests.
2. `parse_logprobs`: Parses log probabilities from API responses.
3. `parse_generations`: Parses generated text from API responses.
4. `headers`: Returns the headers for the API request.
Optional Properties:
4. `header`: Returns the headers for the API request.
5. `api_key`: Returns the API key for authentication (if required).
You may also need to override other methods or properties depending on your API's specific requirements.
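As a point of reference, a subclass could look roughly like the sketch below. The class name, registry key, payload shape, and response fields are placeholders, and the import path and exact method signatures should be checked against the `TemplateAPI` base class.
```python
from lm_eval.api.registry import register_model
from lm_eval.models.api_models import TemplateAPI  # assumed import path


@register_model("my-json-api")  # hypothetical registry name
class MyJSONAPI(TemplateAPI):
    def _create_payload(self, messages, generate=True, gen_kwargs=None, **kwargs):
        # Build the JSON body the endpoint expects (illustrative shape).
        return {"model": self.model, "prompt": messages, **(gen_kwargs or {})}

    @staticmethod
    def parse_generations(outputs, **kwargs):
        # Pull generated strings out of the (possibly batched) API responses.
        outputs = outputs if isinstance(outputs, list) else [outputs]
        return [choice["text"] for out in outputs for choice in out.get("choices", [])]

    @staticmethod
    def parse_logprobs(outputs, tokens=None, ctxlens=None, **kwargs):
        # Return (logprob_sum, is_greedy) tuples; omitted in this sketch.
        raise NotImplementedError

    @property
    def header(self) -> dict:
        # Optional: replace the default {"Authorization": f"Bearer {self.api_key}"}.
        return {"x-api-key": self.api_key}
```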
......@@ -97,6 +101,10 @@ When initializing a `TemplateAPI` instance or a subclass, you can provide severa
- Whether to validate the certificate of the API endpoint (if HTTPS).
- Default is True.
- `header` (dict, optional):
- Custom headers for API requests.
- If not provided, uses `{"Authorization": f"Bearer {self.api_key}"}` by default.
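For instance, a custom authentication scheme can be supplied at construction time. This is a minimal sketch only; the subclass, endpoint URL, and header key are placeholders:
```python
lm = MyJSONAPI(                      # hypothetical TemplateAPI subclass
    base_url="https://example.com/v1/completions",  # placeholder endpoint
    model="my-model",
    tokenizer_backend=None,          # skip local tokenization in this sketch
    header={"x-api-key": "sk-..."},  # used instead of the default Bearer header
)
```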
Example usage:
```python
......
......@@ -436,7 +436,10 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
args.model_args = args.model_args + ",trust_remote_code=True"
if isinstance(args.model_args, dict):
args.model_args["trust_remote_code"] = True
else:
args.model_args = args.model_args + ",trust_remote_code=True"
(
eval_logger.info(f"Selected Tasks: {task_names}")
if eval_logger.getEffectiveLevel() >= logging.INFO
......
......@@ -510,7 +510,6 @@ def bootstrap_stderr(
if not os.getenv("DISABLE_MULTIPROC"):
import multiprocessing as mp
pool = mp.Pool(mp.cpu_count())
# this gives a biased estimate of the stderr (i.e. w/ the mean, it gives something
# equivalent to stderr calculated without Bessel's correction in the stddev).
# Unfortunately, I haven't been able to figure out what the right correction is
......@@ -522,17 +521,16 @@ def bootstrap_stderr(
from tqdm import tqdm
print("bootstrapping for stddev:", f.__name__)
for bootstrap in tqdm(
pool.imap(
_bootstrap_internal(f, chunk_size),
[(i, xs) for i in range(iters // chunk_size)],
),
total=iters // chunk_size,
):
# sample w replacement
res.extend(bootstrap)
pool.close()
with mp.Pool(mp.cpu_count()) as pool:
for bootstrap in tqdm(
pool.imap(
_bootstrap_internal(f, chunk_size),
[(i, xs) for i in range(iters // chunk_size)],
),
total=iters // chunk_size,
):
# sample w replacement
res.extend(bootstrap)
else:
res = _bootstrap_internal_no_mp(f, xs, iters)
......
......@@ -154,15 +154,23 @@ def simple_evaluate(
"Either 'limit' or 'samples' must be None, but both are not None."
)
_NEEDS_CHAT_TEMPLATE = ("inst", "chat")
if (
(isinstance(model_args, str) and "inst" in model_args.lower())
(
isinstance(model_args, str)
and any(kw in model_args.lower() for kw in _NEEDS_CHAT_TEMPLATE)
)
or (
isinstance(model_args, dict)
and any("inst" in str(v).lower() for v in model_args.values())
and any(
any(kw in str(v).lower() for kw in _NEEDS_CHAT_TEMPLATE)
for v in model_args.values()
)
)
) and not apply_chat_template:
eval_logger.warning(
"Model appears to be an instruct variant but chat template is not applied. Recommend setting `apply_chat_template` (optionally `fewshot_as_multiturn`)."
"Model appears to be an instruct or chat variant but chat template is not applied. "
"Recommend setting `apply_chat_template` (optionally `fewshot_as_multiturn`)."
)
if delete_requests_cache:
......@@ -752,6 +760,7 @@ def evaluate(
samples = (
hash_dict_images(samples)
if os.environ.get("LMEVAL_HASHMM", "1") != "0"
and (hasattr(lm, "MULTIMODAL"))
else samples
)
results_dict["samples"] = dict(samples)
......
......@@ -145,7 +145,7 @@ class MultiChoiceRegexFilter(RegexFilter):
"""
regex_pattern: The basic regex pattern to use. If it fails to match, we will use the customized match procedure
- step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response.
- step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices.
- step 2 : We parse the choice with regex: r'[\s]*([A-?])', where ? varies by number of choices.
group_select: Selects the (group_select)th match from the findall result.
ignore_case: Ignores the case during step 1 matching
ignore_punctuation: Remove the punctuation during step 1 matching
......
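To make the step-2 fallback in the docstring above concrete, here is a small sketch (not the filter's actual code) assuming four answer choices A-D:
```python
import re

# With four choices, the step-2 fallback pattern becomes r"[\s]*([A-D])".
fallback = re.compile(r"[\s]*([A-D])")
matches = fallback.findall("The answer is (B) because ...")
print(matches)  # ['B'] -> group_select=0 keeps the first match, 'B'
```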
......@@ -135,6 +135,7 @@ class TemplateAPI(TemplateLM):
eos_string: str = None,
# timeout in seconds
timeout: int = 300,
header: Optional[Dict[str, str]] = None,
max_images: int = 1,
**kwargs,
) -> None:
......@@ -152,6 +153,7 @@ class TemplateAPI(TemplateLM):
self.model = model or pretrained
self.base_url = base_url
self.tokenizer = tokenizer
self._header = header
if not isinstance(batch_size, int) and "auto" in batch_size:
eval_logger.warning(
"Automatic batch size is not supported for API models. Defaulting to batch size 1."
......@@ -296,7 +298,7 @@ class TemplateAPI(TemplateLM):
@cached_property
def header(self) -> dict:
"""Override this property to return the headers for the API request."""
return {"Authorization": f"Bearer {self.api_key}"}
return self._header or {"Authorization": f"Bearer {self.api_key}"}
@property
def tokenizer_name(self) -> str:
......@@ -447,6 +449,7 @@ class TemplateAPI(TemplateLM):
async def amodel_call(
self,
session: ClientSession,
sem: asyncio.Semaphore,
messages: Union[List[List[int]], List[str], List[JsonChatStr]],
*,
generate: bool = True,
......@@ -465,6 +468,7 @@ class TemplateAPI(TemplateLM):
**kwargs,
)
cache_method = "generate_until" if generate else "loglikelihood"
acquired = await sem.acquire()
try:
async with session.post(
self.base_url,
......@@ -474,7 +478,8 @@ class TemplateAPI(TemplateLM):
if not response.ok:
error_text = await response.text()
eval_logger.warning(
f"API request failed with error message: {error_text}. Retrying..."
f"API request failed! Status code: {response.status}, "
f"Response text: {error_text}. Retrying..."
)
# raising exception will retry the request
response.raise_for_status()
......@@ -495,11 +500,12 @@ class TemplateAPI(TemplateLM):
self.cache_hook.add_partial(cache_method, cache, res)
return answers
# If the retries also fail
except RetryError:
eval_logger.error(
"API request failed after multiple retries. Please check the API status."
)
return None
except BaseException as e:
eval_logger.error(f"Exception:{repr(e)}, {outputs}, retrying.")
raise e
finally:
if acquired:
sem.release()
def batch_loglikelihood_requests(
self, chunks: Iterable[List[LogLikelihoodInputs]]
......@@ -535,6 +541,7 @@ class TemplateAPI(TemplateLM):
) -> Union[List[List[str]], List[List[Tuple[float, bool]]]]:
ctxlens = ctxlens if ctxlens else [None] * len(requests)
conn = TCPConnector(limit=self._concurrent, ssl=self.verify_certificate)
sem = asyncio.Semaphore(self._concurrent)
async with ClientSession(
connector=conn, timeout=ClientTimeout(total=self.timeout)
) as session:
......@@ -542,12 +549,16 @@ class TemplateAPI(TemplateLM):
stop=stop_after_attempt(self.max_retries),
wait=wait_exponential(multiplier=0.5, min=1, max=10),
reraise=True,
before_sleep=lambda retry_state: eval_logger.info(
f"Retry attempt {retry_state.attempt_number}"
),
)(self.amodel_call)
# Create tasks for each batch of request
tasks = [
asyncio.create_task(
retry_(
session=session,
sem=sem,
messages=message,
cache_keys=cache_key,
generate=generate,
......
......@@ -34,6 +34,7 @@ from lm_eval.models.utils import (
get_dtype,
handle_stop_sequences,
pad_and_concat,
postprocess_generated_text,
stop_sequences_criteria,
)
......@@ -76,6 +77,7 @@ class HFLM(TemplateLM):
device: Optional[str] = "cuda",
dtype: Optional[Union[str, torch.dtype]] = "auto",
softmax_dtype: Optional[Union[str, torch.dtype]] = None,
mixed_precision_dtype: Optional[Union[str, torch.dtype]] = None,
batch_size: Optional[Union[int, str]] = 1,
max_batch_size: Optional[int] = 64,
trust_remote_code: Optional[bool] = False,
......@@ -94,6 +96,9 @@ class HFLM(TemplateLM):
autogptq: Optional[Union[bool, str]] = False,
gptqmodel: Optional[bool] = False,
gguf_file: Optional[str] = None,
# end token for thinking, either the string or int token id.
# splits to get response after this token (if provided).
think_end_token: Union[str, int, None] = None,
**kwargs,
) -> None:
super().__init__()
......@@ -223,6 +228,11 @@ class HFLM(TemplateLM):
self.model.eval()
self.model.tie_weights()
self.think_end_token = (
int(think_end_token)
if (isinstance(think_end_token, str) and think_end_token.isdigit())
else think_end_token
)
self.truncation = truncation
self.logits_cache = logits_cache
self.vocab_size = self.tokenizer.vocab_size
......@@ -247,6 +257,11 @@ class HFLM(TemplateLM):
self.softmax_dtype = (
get_dtype(softmax_dtype) if softmax_dtype is not None else None
)
self.mixed_precision_dtype = (
get_dtype(mixed_precision_dtype)
if mixed_precision_dtype is not None
else None
)
if str(batch_size).startswith("auto"):
batch_size = batch_size.split(":")
......@@ -903,18 +918,23 @@ class HFLM(TemplateLM):
logits returned from the model's decoder
"""
with torch.no_grad():
if attn_mask is not None or labels is not None:
assert attn_mask is not None and labels is not None
assert self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM
return self.model(
input_ids=inps, attention_mask=attn_mask, labels=labels
).logits
else:
assert self.AUTO_MODEL_CLASS in (
transformers.AutoModelForCausalLM,
transformers.AutoModelForVision2Seq,
)
return self.model(inps).logits
with torch.autocast(
device_type=self.device.type,
dtype=self.mixed_precision_dtype,
enabled=self.mixed_precision_dtype is not None,
):
if attn_mask is not None or labels is not None:
assert attn_mask is not None and labels is not None
assert self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM
return self.model(
input_ids=inps, attention_mask=attn_mask, labels=labels
).logits
else:
assert self.AUTO_MODEL_CLASS in (
transformers.AutoModelForCausalLM,
transformers.AutoModelForVision2Seq,
)
return self.model(inps).logits
def _model_generate(self, context, max_length, stop, **generation_kwargs):
# temperature = 0.0 if not set
......@@ -934,14 +954,19 @@ class HFLM(TemplateLM):
stopping_criteria = stop_sequences_criteria(
self.tokenizer, stop, context.shape[1], context.shape[0]
)
return self.model.generate(
input_ids=context,
max_length=max_length,
stopping_criteria=stopping_criteria,
pad_token_id=self.tokenizer.pad_token_id,
use_cache=True,
**generation_kwargs,
)
with torch.autocast(
device_type=self.device.type,
dtype=self.mixed_precision_dtype,
enabled=self.mixed_precision_dtype is not None,
):
return self.model.generate(
input_ids=context,
max_length=max_length,
stopping_criteria=stopping_criteria,
pad_token_id=self.tokenizer.pad_token_id,
use_cache=True,
**generation_kwargs,
)
def _select_cont_toks(
self, logits: torch.Tensor, contlen: int = None, inplen: int = None
......@@ -1411,15 +1436,30 @@ class HFLM(TemplateLM):
if self.backend == "causal":
cont_toks = cont_toks[context_enc.shape[1] :]
s = self.tok_decode(cont_toks)
# Handle integer think_end_token: find its last occurrence and keep only the tokens after it
if isinstance(self.think_end_token, int):
think_token_indices = [
i
for i, token in enumerate(cont_toks)
if token == self.think_end_token
]
if think_token_indices:
cont_toks = cont_toks[think_token_indices[-1] + 1 :]
# use secondary stop seqs to cut off should-have-been-stopped content post-hoc
for term in until:
if len(term) > 0:
# ignore '' separator,
# for seq2seq case where self.tok_decode(self.eot_token_id) = ''
s = s.split(term)[0]
s = self.tok_decode(cont_toks)
# Strip leading whitespace if we removed thinking tokens
if isinstance(self.think_end_token, int):
s = s.lstrip()
# Apply post-processing: remove stop sequences and string-based thinking tokens
s = postprocess_generated_text(
generation=s,
stop=until,
think_end_token=self.think_end_token
if isinstance(self.think_end_token, str)
else None,
)
res.append(s)
self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)
......
......@@ -16,8 +16,8 @@ eval_logger = logging.getLogger(__name__)
class LocalCompletionsAPI(TemplateAPI):
def __init__(
self,
base_url=None,
tokenizer_backend="huggingface",
base_url: str = None,
tokenizer_backend: str = "huggingface",
**kwargs,
):
super().__init__(
......@@ -108,9 +108,9 @@ class LocalCompletionsAPI(TemplateAPI):
class LocalChatCompletion(LocalCompletionsAPI):
def __init__(
self,
base_url=None,
tokenizer_backend=None,
tokenized_requests=False,
base_url: str = None,
tokenizer_backend: str = None,
tokenized_requests: bool = False,
**kwargs,
):
eval_logger.warning(
......@@ -236,6 +236,7 @@ class OpenAIChatCompletion(LocalChatCompletion):
eval_logger.warning(
"o1 models do not support `stop` and only support temperature=1"
)
super().__init__(
base_url=base_url,
tokenizer_backend=tokenizer_backend,
......
......@@ -11,6 +11,7 @@ from lm_eval.api.registry import register_model
from lm_eval.models.utils import (
Collator,
handle_stop_sequences,
postprocess_generated_text,
)
from lm_eval.utils import (
get_rolling_token_windows,
......@@ -59,6 +60,8 @@ class SGLangLM(TemplateLM):
dp_size: int = 1,
tp_size: int = 1,
prefix_token_id: Optional[int] = None,
# End marker for thinking tags - splits to get response after this token (if provided).
think_end_token: Optional[str] = None,
**kwargs,
):
super().__init__()
......@@ -74,6 +77,7 @@ class SGLangLM(TemplateLM):
"Either context_length or max_model_len may be provided, but not both"
)
# Initialize your sglang model here
self.think_end_token = think_end_token
self._max_length = (
max_model_len if max_model_len is not None else context_length
)
......@@ -263,6 +267,9 @@ class SGLangLM(TemplateLM):
# cache generations
for output, context in zip(cont, context):
generated_text = output.get("text", "")
generated_text = postprocess_generated_text(
generated_text, until, self.think_end_token
)
res.append(generated_text)
self.cache_hook.add_partial(
"generate_until", (context, gen_kwargs), generated_text
......
......@@ -852,3 +852,32 @@ def truncate_tokens(
right_length = max_length - left_length
return tokens[:left_length] + tokens[-right_length:]
return None
def postprocess_generated_text(
generation: str, stop: Union[list[str], str, None], think_end_token: Optional[str]
) -> str:
"""
Post-processes the generated text by stripping stop sequences and optional thinking markers.
Args:
generation (str): The generated text to be processed.
stop (Union[list[str], str, None]): Stop sequence(s) to remove. Text is truncated
at the first occurrence of any stop sequence.
think_end_token (Optional[str]): Token marking end of thinking section. If provided,
returns only the text after this token (discarding thinking content).
Returns:
str: The processed generation - text before stop sequences and after thinking sections.
"""
if stop:
stop = [stop] if isinstance(stop, str) else stop
for term in stop:
if len(term) > 0:
# ignore '' separator,
# for seq2seq case where self.tok_decode(self.eot_token_id) = ''
generation = generation.split(term)[0]
if think_end_token:
generation = generation.split(think_end_token)[-1].lstrip()
return generation
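A quick usage illustration of the helper above (the strings are made up for the example):
```python
from lm_eval.models.utils import postprocess_generated_text

text = "step 1 ... step 2 ...</think> The answer is 42.\n\nQuestion:"
cleaned = postprocess_generated_text(
    generation=text,
    stop=["\n\nQuestion:"],       # cut at the first stop sequence
    think_end_token="</think>",   # keep only the text after the thinking trace
)
print(cleaned)  # "The answer is 42."
```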
......@@ -22,6 +22,7 @@ from lm_eval.models.utils import (
Collator,
configure_pad_token,
handle_stop_sequences,
postprocess_generated_text,
undistribute,
)
from lm_eval.utils import (
......@@ -133,7 +134,11 @@ class VLLM(TemplateLM):
device: str = "cuda",
data_parallel_size: int = 1,
lora_local_path: str = None,
enable_thinking: bool = False,
# VLLM: enable thinking tags in the prompt.
enable_thinking: bool = True,
# End marker for thinking tags - splits to get response after this token (if provided).
think_end_token: Optional[str] = None,
max_lora_rank: int = 16,
**kwargs,
):
super().__init__()
......@@ -147,6 +152,7 @@ class VLLM(TemplateLM):
assert max_length is None or max_model_len is None, (
"Either max_length or max_model_len may be provided, but not both"
)
self.think_end_token = think_end_token
self.V1 = os.environ.get("VLLM_USE_V1", "1") != "0"
self._max_length = max_model_len if max_model_len is not None else max_length
self.tensor_parallel_size = int(tensor_parallel_size)
......@@ -167,6 +173,8 @@ class VLLM(TemplateLM):
"quantization": quantization,
"seed": int(seed),
"device": str(device),
"enable_lora": True if lora_local_path else False,
"max_lora_rank": int(max_lora_rank),
}
self.model_args.update(kwargs)
self.batch_size = (
......@@ -627,11 +635,11 @@ class VLLM(TemplateLM):
# cache generations
for output, context in zip(cont, context):
generated_text = output.outputs[0].text
generated_text: str = output.outputs[0].text
# use secondary stop seqs to cut off should-have-been-stopped content post-hoc
for term in until:
if len(term) > 0:
generated_text = generated_text.split(term)[0]
generated_text = postprocess_generated_text(
generated_text, until, self.think_end_token
)
res.append(generated_text)
self.cache_hook.add_partial(
"generate_until", (context, gen_kwargs), generated_text
......
......@@ -45,7 +45,9 @@
| [darija_bench](darija_bench/README.md) | Traditional NLP tasks (Translation, Summarization, etc.) for Moroccan Darija | Moroccan Darija (some MT) |
| [darijahellaswag](darijahellaswag/README.md) | Moroccan Darija version of HellaSwag. | Moroccan Darija (MT) |
| [darijammlu](darijammlu/README.md)| Multiple-choice QA in Moroccan Darija (an Arabic dialect). | Moroccan Darija (MT) |
| [drop](drop/README.md) | Tasks requiring numerical reasoning, reading comprehension, and question answering. | English |
| [egyhellaswag](egyhellaswag/README.md) | Egyptian Arabic (Masri) version of HellaSwag. | Egyptian Arabic (MT) |
| [egymmlu](egymmlu/README.md) | Multiple-choice QA in Egyptian Arabic. | Egyptian Arabic (MT) |
| [eq_bench](eq_bench/README.md) | Tasks focused on equality and ethics in question answering and decision-making. | English |
| [eus_exams](eus_exams/README.md) | Tasks based on various professional and academic exams in the Basque language. | Basque |
| [eus_proficiency](eus_proficiency/README.md) | Tasks designed to test proficiency in the Basque language across various topics. | Basque |
......
......@@ -53,4 +53,7 @@ None.
- [ ] Majority voting "without CoT"
### Changelog
no version change: changed dataset to `SaylorTwift/bbh`. Do not expect any change in the results.
- no version change: changed dataset to `SaylorTwift/bbh`. Do not expect any change in the results.
- `bbh_cot_fewshot` v.4.0; 2025-07-14:
- PR #3140. Removed duplicate "Let's think step by step" from the fewshots.
- Set `target_delimiter` to `""` as the fewshot samples already end with a newline character.
......@@ -2,6 +2,7 @@ dataset_path: SaylorTwift/bbh
output_type: generate_until
test_split: test
doc_to_target: "{{target}}"
target_delimiter: ""
metric_list:
- metric: exact_match
aggregation: mean
......@@ -24,4 +25,4 @@ filter_list:
- function: "take_first"
num_fewshot: 3
metadata:
version: 3.0
version: 4.0
......@@ -26,9 +26,7 @@ fewshot_config:
- Yes
- No'
target: 'Let''s think step by step.
Here in this question, we are told that "Frank T. had no experience with guns,
target: 'Here in this question, we are told that "Frank T. had no experience with guns,
his hand slipped on the barrel of the gun, and the shot went wild." A typical
person would assume that this passage suggests that Frank T. had no intention
of shooting and injuring someone and that the bullet accidentally hit the neighbor''s
......@@ -50,9 +48,7 @@ fewshot_config:
- Yes
- No'
target: 'Let''s think step by step.
Here in this question, we are told that the boss ordered them both to arrive
target: 'Here in this question, we are told that the boss ordered them both to arrive
at the meeting room at the same time and that the motion detector was set up
to be triggered if at least one person appeared in the room at the same time."
A typical person would assume that the person probably meant to say the detector
......@@ -82,9 +78,7 @@ fewshot_config:
- Yes
- No'
target: 'Let''s think step by step.
Here in this question, we are told that "He aims the dart at the low point region."
target: 'Here in this question, we are told that "He aims the dart at the low point region."
A typical person might therefore think George did intentionally hit the low
point region, because he wanted to lift up the spirit of his sister Lena. So
the answer is Yes.'
......
......@@ -26,9 +26,7 @@ fewshot_config:
(E) 07/14/1938
(F) 12/14/1988'
target: 'Let''s think step by step.
If today is Christmas Eve of 1937, then today''s date is December 24, 1937.
target: 'If today is Christmas Eve of 1937, then today''s date is December 24, 1937.
10 days before today is December 14, 1937, that is 12/14/1937. So the answer
is (D).'
- input: 'Tomorrow is 11/12/2019. What is the date one year ago from today in MM/DD/YYYY?
......@@ -44,9 +42,7 @@ fewshot_config:
(D) 11/02/2018
(E) 11/04/2018'
target: 'Let''s think step by step.
If tomorrow is 11/12/2019, then today is 11/11/2019. The date one year ago from
target: 'If tomorrow is 11/12/2019, then today is 11/11/2019. The date one year ago from
today is 11/11/2018. So the answer is (B).'
- input: 'Jane and John married on Jan 2, 1958. It is their 5-year anniversary today.
What is the date tomorrow in MM/DD/YYYY?
......@@ -64,9 +60,7 @@ fewshot_config:
(E) 01/03/1982
(F) 12/03/1960'
target: 'Let''s think step by step.
If Jane and John married on Jan 2, 1958, then and if it is their 5-year anniversary
target: 'If Jane and John married on Jan 2, 1958, then and if it is their 5-year anniversary
today, then today''s date is Jan 2, 1963. The date tomorrow is Jan 3, 1963,
that is 01/03/1963. So the answer is (B).'
include: _cot_fewshot_template_yaml
......
......@@ -23,9 +23,7 @@ fewshot_config:
(B) The counselor took the day off
(C) Ambiguous'
target: 'Let''s think step by step.
Here we need to determine who the pronoun "they" might be referring to. There
target: 'Here we need to determine who the pronoun "they" might be referring to. There
are two possible referents for "they", namely the chief and the counselor. The
verb "told" might be able to help us determine which one is more likely (if
either). Let X be the chief and Y the counselor. The sentence is then of the
......@@ -54,9 +52,7 @@ fewshot_config:
(B) The manager didn''t reply yet
(C) Ambiguous'
target: 'Let''s think step by step.
Here we need to determine who the pronoun "he" might be referring to. There
target: 'Here we need to determine who the pronoun "he" might be referring to. There
are two possible referents for "he", namely the manager and the secretary. The
verbs "sent" and "reply" might be able to help us determine which one is more
likely (if either). Let X be the manager and Y the secretary. The sentence is
......@@ -84,9 +80,7 @@ fewshot_config:
(B) It will be the director''s office
(C) Ambiguous'
target: 'Let''s think step by step.
Here we need to determine who the pronoun "his" might be referring to. There
target: 'Here we need to determine who the pronoun "his" might be referring to. There
are two possible referents for "his", namely Bailey''s and the director''s.
The verb phrase "plan to meet" might be able to help us determine which one
is more likely (if either). Let X be Bailey and Y the director. The sentence
......
......@@ -13,9 +13,7 @@ fewshot_config:
samples:
- input: 'Complete the rest of the sequence, making sure that the parentheses are
closed properly. Input: [ { ['
target: 'Let''s think step by step.
We should process each input one by one and keep track of the stack configuration.
target: 'We should process each input one by one and keep track of the stack configuration.
0: empty stack
......@@ -32,9 +30,7 @@ fewshot_config:
So, we need "]", "}", "]". So the answer is ] } ].'
- input: 'Complete the rest of the sequence, making sure that the parentheses are
closed properly. Input: < > ( ( [ [ ( { } ) [ < > ] ]'
target: 'Let''s think step by step.
We should process each input one by one and keep track of the stack configuration.
target: 'We should process each input one by one and keep track of the stack configuration.
0: empty stack
......@@ -76,9 +72,7 @@ fewshot_config:
- input: 'Complete the rest of the sequence, making sure that the parentheses are
closed properly. Input: < [ < [ { < [ ] < { } > > } ] > { { ( ) } { < [ < >
] > }'
target: 'Let''s think step by step.
We should process each input one by one and keep track of the stack configuration.
target: 'We should process each input one by one and keep track of the stack configuration.
0: empty stack
......
......@@ -25,7 +25,7 @@ fewshot_config:
- valid
- invalid'
target: "Let's think step by step.\n(1) Lesley is a close friend of Fernando:\
target: "(1) Lesley is a close friend of Fernando:\
\ Lesley = friend(Fernando).\n(2) Being a close friend of Fernando or a schoolmate\
\ of Lowell is sufficient for being a great-grandfather of Leroy: If X = friend(Fernando)\
\ OR SCHOOLMATE(Lowell), then X = great-grandfather(Leroy).\nHypothesis: Does\
......@@ -49,7 +49,7 @@ fewshot_config:
- valid
- invalid'
target: "Let's think step by step.\n(1) Whoever is not a great-grandfather of\
target: "(1) Whoever is not a great-grandfather of\
\ Clyde is a stepbrother of Brian: If X = NOT (great-grandfather(Clyde)), then\
\ X = stepbrother(Brian).\n(2): Being an ancestor of Dana is sufficient for\
\ not being a great-grandfather of Clyde: If X = ancestor(Dana), X = NOT (great-grandfather(Clyde)).\n\
......@@ -78,7 +78,7 @@ fewshot_config:
- valid
- invalid'
target: "Let's think step by step.\n(1) Every infrequent user of Paul Mitchell\
target: "(1) Every infrequent user of Paul Mitchell\
\ shampoo is either a rare consumer of Nioxin shampoo or a loyal buyer of Caress\
\ soap, or both: If X = infrequent-user(Paul Mitchell), then X = rare-consumer(Nioxin)\
\ OR X = loyal-buyer(Caress).\n(2): No regular consumer of Lush soap is a rare\
......