Commit 601be343 authored by Baber

Merge branch 'main' into feature/eval_from_config

parents d0884a96 68c3a811
......@@ -12,6 +12,7 @@ from lm_eval.models.utils import (
Collator,
handle_stop_sequences,
replace_placeholders,
resize_image,
undistribute,
)
from lm_eval.models.vllm_causallms import VLLM
......@@ -44,8 +45,20 @@ class VLLM_VLM(VLLM):
interleave: bool = True,
# TODO<baber>: handle max_images and limit_mm_per_prompt better
max_images: int = 999,
image_width: Optional[int] = None,
image_height: Optional[int] = None,
image_max_side: Optional[int] = None,
**kwargs,
):
self.image_width = image_width
self.image_height = image_height
self.image_max_side = image_max_side
if self.image_max_side and (self.image_width or self.image_height):
raise ValueError(
"Ambiguous config for image resize: you can not specify both "
"image_max_side and (image_width or image_height)"
)
if max_images != 999:
kwargs["limit_mm_per_prompt"] = {"image": max_images}
eval_logger.info(f"Setting limit_mm_per_prompt[image] to {max_images}")
......@@ -93,7 +106,7 @@ class VLLM_VLM(VLLM):
outputs.append(inputs)
return outputs
def _model_generate(
def _multimodal_model_generate(
self,
requests: List[List[dict]] = None,
generate: bool = False,
......@@ -205,7 +218,10 @@ class VLLM_VLM(VLLM):
def generate_until(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[str]:
# TODO: support text-only reqs
if requests and len(requests[0].args) < 3:
# Fall back to non-multimodal generation.
return super().generate_until(requests=requests, disable_tqdm=disable_tqdm)
res = []
def _collate(x):
......@@ -239,7 +255,15 @@ class VLLM_VLM(VLLM):
for chunk in chunks:
contexts, all_gen_kwargs, aux_arguments = zip(*chunk)
visuals = [arg["visual"] for arg in aux_arguments]
visuals = [
[
resize_image(
img, self.image_width, self.image_height, self.image_max_side
)
for img in arg["visual"]
]
for arg in aux_arguments
]
if not isinstance(contexts, list):
contexts = list(
......@@ -272,7 +296,7 @@ class VLLM_VLM(VLLM):
left_truncate_len=max_ctx_len,
)
cont = self._model_generate(
cont = self._multimodal_model_generate(
inputs, stop=until, generate=True, max_tokens=max_gen_toks, **kwargs
)
......@@ -288,3 +312,12 @@ class VLLM_VLM(VLLM):
pbar.close()
return res
def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
if requests and len(requests[0].args) < 3:
# Fall back to non-multimodal generation.
return super().loglikelihood_rolling(requests=requests)
raise NotImplementedError(
    "model type `vllm-vlm` does not support loglikelihood_rolling; use the 'vlm' model type for text-only loglikelihood_rolling tasks. "
    "This is because we do not support measuring the loglikelihood a model assigns to an image."
)
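
For context, a hypothetical invocation exercising the new resize options through the Python API; the checkpoint and task name are placeholders, not part of this change:

```python
# Hypothetical usage of the new image-resize options; the checkpoint and task
# name are placeholders, not part of this change.
import lm_eval

results = lm_eval.simple_evaluate(
    model="vllm-vlm",
    model_args="pretrained=Qwen/Qwen2-VL-2B-Instruct,image_max_side=672",
    tasks=["mmmu_val"],
)
```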
......@@ -5,165 +5,167 @@
For more information, including a full list of task names and their precise meanings or sources, follow the links provided to the individual README.md files for each subfolder.
| Task Family | Description | Language(s) |
|--------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------|
| [aclue](aclue/README.md) | Tasks focusing on ancient Chinese language understanding and cultural aspects. | Ancient Chinese |
| [acp_bench](acpbench/README.md) | Tasks evaluating reasoning about Action, Change, and Planning. | English |
| [acp_bench_hard](acpbench/README.md) | Generative, open-ended variants of the ACPBench reasoning tasks about Action, Change, and Planning. | English |
| [aexams](aexams/README.md) | Tasks in Arabic related to various academic exams covering a range of subjects. | Arabic |
| [agieval](agieval/README.md) | Tasks involving historical data or questions related to history and historical texts. | English, Chinese |
| [anli](anli/README.md) | Adversarial natural language inference tasks designed to test model robustness. | English |
| [arabic_leaderboard_complete](arabic_leaderboard_complete/README.md) | A full version of the tasks in the Open Arabic LLM Leaderboard, focusing on the evaluation of models that reflect the characteristics of Arabic language understanding and comprehension, culture, and heritage. Note that some of these tasks are machine-translated. | Arabic (Some MT) |
| [arabic_leaderboard_light](arabic_leaderboard_light/README.md) | A light version of the tasks in the Open Arabic LLM Leaderboard (i.e., 10% samples of the test set in the original benchmarks), focusing on the evaluation of models that reflect the characteristics of Arabic language understanding and comprehension, culture, and heritage. Note that some of these tasks are machine-translated. | Arabic (Some MT) |
| [arabicmmlu](arabicmmlu/README.md) | Localized Arabic version of MMLU with multiple-choice questions from 40 subjects. | Arabic |
| [ArabCulture](arab_culture/README.md) | Benchmark for evaluating models' commonsense cultural knowledge across 13 different Arab countries. | Arabic |
| [AraDICE](aradice/README.md) | A collection of multiple tasks carefully designed to evaluate dialectal and cultural capabilities in large language models (LLMs). | Arabic |
| [arc](arc/README.md) | Tasks involving complex reasoning over a diverse set of questions. | English |
| [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. | English |
| [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English |
| [babi](babi/README.md) | Tasks designed as question and answering challenges based on simulated stories. | English |
| [basque_bench](basque_bench/README.md) | Collection of tasks in Basque encompassing various evaluation areas. | Basque |
| [basqueglue](basqueglue/README.md) | Tasks designed to evaluate language understanding in Basque language. | Basque |
| [bbh](bbh/README.md) | Challenging subset of BIG-Bench tasks (BIG-Bench Hard) requiring multi-step reasoning. | English, German |
| [bbq](bbq/README.md) | A question-answering benchmark designed to measure social biases in language models across various demographic categories and contexts. | English |
| [belebele](belebele/README.md) | Language understanding tasks in a variety of languages and scripts. | Multiple (122 languages) |
| benchmarks | General benchmarking tasks that test a wide range of language understanding capabilities. | |
| [bertaqa](bertaqa/README.md) | Local Basque cultural trivia QA tests in English and Basque languages. | English, Basque, Basque (MT) |
| [bigbench](bigbench/README.md) | Broad tasks from the BIG-bench benchmark designed to push the boundaries of large models. | Multiple |
| [blimp](blimp/README.md) | Tasks testing grammatical phenomena to evaluate language models' linguistic capabilities. | English |
| [c4](c4/README.md) | Tasks based on a colossal, cleaned version of Common Crawl's web crawl corpus to assess models' language modeling capabilities. | English |
| [careqa](careqa/README.md) | Multiple choice and open-ended medical question answering based on the Spanish Specialised Healthcare Training (MIR) exams. | English, Spanish |
| [catalan_bench](catalan_bench/README.md) | Collection of tasks in Catalan encompassing various evaluation areas. | Catalan |
| [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese |
| [cmmlu](cmmlu/README.md) | Multi-subject multiple choice question tasks for comprehensive academic assessment. | Chinese |
| code_x_glue | Tasks that involve understanding and generating code across multiple programming languages. | Go, Java, JS, PHP, Python, Ruby |
| [commonsense_qa](commonsense_qa/README.md) | CommonsenseQA, a multiple-choice QA dataset for measuring commonsense knowledge. | English |
| [copal_id](copal_id/README.md) | Indonesian causal commonsense reasoning dataset that captures local nuances. | Indonesian |
| [coqa](coqa/README.md) | Conversational question answering tasks to test dialog understanding. | English |
| [crows_pairs](crows_pairs/README.md) | Tasks designed to test model biases in various sociodemographic groups. | English, French |
| csatqa | Tasks related to SAT and other standardized testing questions for academic assessment. | Korean |
| [darija_bench](darija_bench/README.md) | Traditional NLP tasks (translation, summarization, etc.) for Moroccan Darija. | Moroccan Darija (some MT) |
| [darijahellaswag](darijahellaswag/README.md) | Moroccan Darija version of HellaSwag. | Moroccan Darija (MT) |
| [darijammlu](darijammlu/README.md) | Multiple-choice QA in Moroccan Darija (an Arabic dialect). | Moroccan Darija (MT) |
| [drop](drop/README.md) | Tasks requiring numerical reasoning, reading comprehension, and question answering. | English |
| [eq_bench](eq_bench/README.md) | Tasks focused on equality and ethics in question answering and decision-making. | English |
| [eus_exams](eus_exams/README.md) | Tasks based on various professional and academic exams in the Basque language. | Basque |
| [eus_proficiency](eus_proficiency/README.md) | Tasks designed to test proficiency in the Basque language across various topics. | Basque |
| [eus_reading](eus_reading/README.md) | Reading comprehension tasks specifically designed for the Basque language. | Basque |
| [eus_trivia](eus_trivia/README.md) | Trivia and knowledge testing tasks in the Basque language. | Basque |
| [evalita_LLM](evalita_llm/README.md) | A native Italian benchmark with diverse task formats and multiple prompts. | Italian |
| [fda](fda/README.md) | Tasks for extracting key-value pairs from FDA documents to test information extraction. | English |
| [fld](fld/README.md) | Formal Logic Deduction (FLD) tasks evaluating deductive reasoning ability. | English |
| [french_bench](french_bench/README.md) | Set of tasks designed to assess language model performance in French. | French |
| [galician_bench](galician_bench/README.md) | Collection of tasks in Galician encompassing various evaluation areas. | Galician |
| [global_mmlu](global_mmlu/README.md) | Collection of culturally sensitive and culturally agnostic MMLU tasks in 15 languages with human translations or post-edits. | Multiple (15 languages) |
| [glue](glue/README.md) | General Language Understanding Evaluation benchmark to test broad language abilities. | English |
| [gpqa](gpqa/README.md) | Graduate-level, Google-proof multiple-choice questions in biology, physics, and chemistry. | English |
| [gsm8k](gsm8k/README.md) | A benchmark of grade school math problems aimed at evaluating reasoning capabilities. | English |
| [groundcocoa](groundcocoa/README.md) | A benchmark evaluating the conditional and compositional reasoning of language models using a grounding task. | English |
| [haerae](haerae/README.md) | Tasks focused on assessing detailed factual and historical knowledge. | Korean |
| [headqa](headqa/README.md) | A high-level education-based question answering dataset to test specialized knowledge. | Spanish, English |
| [hellaswag](hellaswag/README.md) | Tasks to predict the ending of stories or scenarios, testing comprehension and creativity. | English |
| [hendrycks_ethics](hendrycks_ethics/README.md) | Tasks designed to evaluate the ethical reasoning capabilities of models. | English |
| [hendrycks_math](hendrycks_math/README.md) | Mathematical problem-solving tasks to test numerical reasoning and problem-solving. | English |
| [histoires_morales](histoires_morales/README.md) | A dataset of structured narratives that describe normative and norm-divergent actions taken by individuals to accomplish certain intentions in concrete situations. | French (Some MT) |
| [hrm8k](hrm8k/README.md) | A challenging bilingual math reasoning benchmark for Korean and English. | Korean (Some MT), English (Some MT) |
| [humaneval](humaneval/README.md) | Code generation tasks that measure functional correctness for synthesizing programs from docstrings. | Python |
| [ifeval](ifeval/README.md) | Instruction-following evaluation tasks that test whether models satisfy explicit formatting and content constraints. | English |
| [inverse_scaling](inverse_scaling/README.md) | Multiple-choice tasks from the Inverse Scaling Prize, designed to find settings where larger language models perform worse. | English |
| [japanese_leaderboard](japanese_leaderboard/README.md) | Japanese language understanding tasks to benchmark model performance on various linguistic aspects. | Japanese |
| [jsonschema_bench](jsonschema_bench/README.md) | Evaluate the ability of LLMs to generate JSON objects that conform to a given JSON schema, including API, configuration files, and other structured data formats. | JSON |
| [kbl](kbl/README.md) | Korean Benchmark for Legal Language Understanding. | Korean |
| [kmmlu](kmmlu/README.md) | Knowledge-based multi-subject multiple choice questions for academic evaluation. | Korean |
| [kobest](kobest/README.md) | A collection of tasks designed to evaluate understanding in Korean language. | Korean |
| [kormedmcqa](kormedmcqa/README.md) | Medical question answering tasks in Korean to test specialized domain knowledge. | Korean |
| [lambada](lambada/README.md) | Tasks designed to predict the endings of text passages, testing language prediction skills. | English |
| [lambada_cloze](lambada_cloze/README.md) | Cloze-style LAMBADA dataset. | English |
| [lambada_multilingual](lambada_multilingual/README.md) | Multilingual LAMBADA dataset. This is a legacy version of the multilingual dataset, and users should instead use `lambada_multilingual_stablelm`. | German, English, Spanish, French, Italian |
| [lambada_multilingual_stablelm](lambada_multilingual_stablelm/README.md) | Multilingual LAMBADA dataset. Users should prefer evaluating on this version of the multilingual dataset instead of on `lambada_multilingual`. | German, English, Spanish, French, Italian, Dutch, Portuguese |
| [leaderboard](leaderboard/README.md) | Task group used by Hugging Face's [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard). These tasks are static and will not change over time. | English |
| [lingoly](lingoly/README.md) | Challenging logical reasoning benchmark in low-resource languages with controls for memorization. | English, Multilingual |
| [llama3](llama3/README.md) | Evals reproducing those provided by the Llama team in the Hugging Face repo (instruct). | English, Multilingual |
| [logiqa](logiqa/README.md) | Logical reasoning tasks requiring advanced inference and deduction. | English, Chinese |
| [logiqa2](logiqa2/README.md) | Large-scale logical reasoning dataset adapted from the Chinese Civil Service Examination. | English, Chinese |
| [longbench](longbench/README.md) | LongBench evaluates language models' ability to understand lengthy texts across multiple tasks and languages. | English, Chinese |
| [mastermind](mastermind/README.md) | Reasoning benchmark based on the board game of Mastermind. | English |
| [mathqa](mathqa/README.md) | Question answering tasks involving mathematical reasoning and problem-solving. | English |
| [mbpp](mbpp/README.md) | A benchmark designed to measure the ability to synthesize short Python programs from natural language descriptions. | Python |
| [meddialog](meddialog/README.md) | Medical open-ended QA and Question Entailment stemming from the MedDialog dataset. | English |
| [medtext](medtext/README.md) | Medical open-ended QA from the MedText Clinical Notes dataset. | English |
| [mimic_repsum](mimic_repsum/README.md) | Medical report summarization from the MIMIC-III dataset. | English |
| [mc_taco](mc_taco/README.md) | Question-answer pairs that require temporal commonsense comprehension. | English |
| [med_concepts_qa](med_concepts_qa/README.md) | Benchmark for evaluating LLMs on their abilities to interpret medical codes and distinguish between medical concepts. | English |
| [metabench](metabench/README.md) | Distilled versions of six popular benchmarks which are highly predictive of overall benchmark performance and of a single general ability latent trait. | English |
| [mediqa_qa2019](mediqa_qa2019/README.md) | Open-ended healthcare question answering benchmark from the MEDIQA 2019 challenge. | English |
| medmcqa | Medical multiple choice questions assessing detailed medical knowledge. | English |
| medqa | Multiple choice question answering based on the United States Medical License Exams. | English |
| [meqsum](meqsum/README.md) | Healthcare Question Entailment benchmark from the MeqSum dataset. | English |
| [mgsm](mgsm/README.md) | Benchmark of multilingual grade-school math problems. | Spanish, French, German, Russian, Chinese, Japanese, Thai, Swahili, Bengali, Telugu |
| [minerva_math](minerva_math/README.md) | Mathematics-focused tasks requiring numerical reasoning and problem-solving skills. | English |
| [mlqa](mlqa/README.md) | MultiLingual Question Answering benchmark dataset for evaluating cross-lingual question answering performance. | English, Arabic, German, Spanish, Hindi, Vietnamese, Simplified Chinese |
| [mmlu](mmlu/README.md) | Massive Multitask Language Understanding benchmark for broad domain language evaluation. Several variants are supported. | English |
| [mmlu_pro](mmlu_pro/README.md) | A refined set of MMLU, integrating more challenging, reasoning-focused questions and expanding the choice set from four to ten options. | English |
| [mmlu-pro-plus](mmlu-pro-plus/README.md) | A new test set for evaluating shortcut learning and higher-order reasoning of LLMs. | English |
| [mmlu_prox](mmlu_prox/README.md) | A multilingual benchmark that extends MMLU-Pro to multiple typologically diverse languages with human validation. | English, Japanese, Chinese, Korean, French, German, Spanish, Portuguese, Swahili, Thai, Arabic, Hindi, Bengali |
| [mmlusr](mmlusr/README.md) | Variation of MMLU designed to be more rigorous. | English |
| model_written_evals | Evaluation tasks auto-generated for evaluating a collection of AI Safety concerns. | |
| [moral_stories](moral_stories/README.md) | A crowd-sourced dataset of structured narratives that describe normative and norm-divergent actions taken by individuals to accomplish certain intentions in concrete situations. | English |
| [mts_dialog](mts_dialog/README.md) | Open-ended healthcare QA from the MTS-Dialog dataset. | English |
| [mutual](mutual/README.md) | A retrieval-based dataset for multi-turn dialogue reasoning. | English |
| [noreval](noreval/README.md) | A human-created Norwegian language understanding and generation benchmark. | Norwegian (Bokmål and Nynorsk) |
| [nq_open](nq_open/README.md) | Open domain question answering tasks based on the Natural Questions dataset. | English |
| [okapi/arc_multilingual](okapi/arc_multilingual/README.md) | Machine-translated multilingual version of the ARC reasoning benchmark. | Multiple (31 languages) **Machine Translated.** |
| [okapi/hellaswag_multilingual](okapi/hellaswag_multilingual/README.md) | Machine-translated multilingual version of the HellaSwag commonsense benchmark. | Multiple (30 languages) **Machine Translated.** |
| okapi/mmlu_multilingual | Machine-translated multilingual version of the MMLU benchmark. | Multiple (34 languages) **Machine Translated.** |
| [okapi/truthfulqa_multilingual](okapi/truthfulqa_multilingual/README.md) | Machine-translated multilingual version of the TruthfulQA benchmark. | Multiple (31 languages) **Machine Translated.** |
| [olaph](olaph/README.md) | Open-ended medical factuality Question Answering from the OLAPH dataset. | English |
| [openbookqa](openbookqa/README.md) | Open-book question answering tasks that require external knowledge and reasoning. | English |
| [paloma](paloma/README.md) | Paloma is a comprehensive benchmark designed to evaluate open language models across a wide range of domains, ranging from niche artist communities to mental health forums on Reddit. | English |
| [paws-x](paws-x/README.md) | Paraphrase Adversaries from Word Scrambling, focusing on cross-lingual capabilities. | English, French, Spanish, German, Chinese, Japanese, Korean |
| [pile](pile/README.md) | Open-source language modelling dataset that consists of 22 smaller, high-quality datasets. | English |
| [pile_10k](pile_10k/README.md) | The first 10K elements of The Pile, useful for debugging models trained on it. | English |
| [piqa](piqa/README.md) | Physical Interaction Question Answering tasks to test physical commonsense reasoning. | English |
| [polemo2](polemo2/README.md) | Sentiment analysis and emotion detection tasks based on Polish language data. | Polish |
| [portuguese_bench](portuguese_bench/README.md) | Collection of tasks in European Portuguese encompassing various evaluation areas. | Portuguese |
| [prost](prost/README.md) | Tasks testing physical reasoning about objects through space and time (PROST). | English |
| [pubmedqa](pubmedqa/README.md) | Question answering tasks based on PubMed research articles for biomedical understanding. | English |
| [qa4mre](qa4mre/README.md) | Question Answering for Machine Reading Evaluation, assessing comprehension and reasoning. | English |
| [qasper](qasper/README.md) | Question Answering dataset based on academic papers, testing in-depth scientific knowledge. | English |
| [race](race/README.md) | Reading comprehension assessment tasks based on English exams in China. | English |
| realtoxicityprompts | Tasks to evaluate language models for generating text with potential toxicity. | |
| [ruler](ruler/README.md) | RULER is a benchmark for testing how well language models handle long pieces of text. Requires custom arg (see readme) | English |
| [sciq](sciq/README.md) | Science Question Answering tasks to assess understanding of scientific concepts. | English |
| [score](score/README.md) | Systematic consistency and robustness evaluation for LLMs on three datasets (MMLU-Pro, AGIEval, and MATH). | English |
| [scrolls](scrolls/README.md) | Tasks that involve long-form reading comprehension across various domains. | English |
| [simple_cooccurrence_bias](simple_cooccurrence_bias/README.md) | A metric that evaluates language models for biases based on stereotypical word associations and co-occurrences in text. | English |
| [siqa](siqa/README.md) | Social Interaction Question Answering to evaluate common sense and social reasoning. | English |
| [spanish_bench](spanish_bench/README.md) | Collection of tasks in Spanish encompassing various evaluation areas. | Spanish |
| [squad_completion](squad_completion/README.md) | A variant of the SQuAD question answering task designed for zero-shot evaluation of small LMs. | English |
| [squadv2](squadv2/README.md) | Stanford Question Answering Dataset version 2, a reading comprehension benchmark. | English |
| [storycloze](storycloze/README.md) | Tasks to predict story endings, focusing on narrative logic and coherence. | English |
| [super_glue](super_glue/README.md) | A suite of challenging tasks designed to test a range of language understanding skills. | English |
| [swag](swag/README.md) | Situations With Adversarial Generations, predicting the most plausible next event from a textual description of a situation. | English |
| [swde](swde/README.md) | Information extraction tasks from semi-structured web pages. | English |
| [tinyBenchmarks](tinyBenchmarks/README.md) | Evaluation of large language models with fewer examples using tiny versions of popular benchmarks. | English |
| [tmmluplus](tmmluplus/README.md) | An extended set of tasks under the TMMLU framework for broader academic assessments. | Traditional Chinese |
| [toxigen](toxigen/README.md) | Tasks designed to evaluate language models on their propensity to generate toxic content. | English |
| [translation](translation/README.md) | Tasks focused on evaluating the language translation capabilities of models. | Arabic, English, Spanish, Basque, Hindi, Indonesian, Burmese, Russian, Swahili, Telugu, Chinese |
| [triviaqa](triviaqa/README.md) | A large-scale dataset for trivia question answering to test general knowledge. | English |
| [truthfulqa](truthfulqa/README.md) | A QA task aimed at evaluating the truthfulness and factual accuracy of model responses. | English |
| [turkishmmlu](turkishmmlu/README.md) | A multiple-choice QA test modeled after MMLU, written in Turkish based on Turkish high-school level exams. | Turkish |
| [unitxt](unitxt/README.md) | A number of tasks implemented using the unitxt library for flexible, shareable, and reusable data preparation and evaluation for generative AI. | English |
| [unscramble](unscramble/README.md) | Tasks involving the rearrangement of scrambled sentences to test syntactic understanding. | English |
| [webqs](webqs/README.md) | Web-based question answering tasks designed to evaluate internet search and retrieval. | English |
| [wikitext](wikitext/README.md) | Tasks based on text from Wikipedia articles to assess language modeling and generation. | English |
| [winogender](winogender/README.md) | A diagnostic dataset that tests for gender bias in coreference resolution by measuring how models associate pronouns with different occupations. | English |
| [winogrande](winogrande/README.md) | A large-scale dataset for coreference resolution, inspired by the Winograd Schema Challenge. | English |
| [wmdp](wmdp/README.md) | A benchmark with the objective of minimizing performance, based on potentially-sensitive multiple-choice knowledge questions. | English |
| [wmt2016](wmt2016/README.md) | Tasks from the WMT 2016 shared task, focusing on translation between multiple languages. | English, Czech, German, Finnish, Russian, Romanian, Turkish |
| [wsc273](wsc273/README.md) | The Winograd Schema Challenge, a test of commonsense reasoning and coreference resolution. | English |
| [xcopa](xcopa/README.md) | Cross-lingual Choice of Plausible Alternatives, testing reasoning in multiple languages. | Estonian, Haitian, Indonesian, Italian, Quechua, Swahili, Tamil, Thai, Turkish, Vietnamese, Chinese |
| [xnli](xnli/README.md) | Cross-Lingual Natural Language Inference to test understanding across different languages. | Arabic, Bulgarian, German, Greek, English, Spanish, French, Hindi, Russian, Swahili, Thai, Turkish, Urdu, Vietnamese, Chinese |
| [xnli_eu](xnli_eu/README.md) | Cross-lingual Natural Language Inference tasks in Basque. | Basque |
| [xquad](xquad/README.md) | Cross-lingual Question Answering Dataset in multiple languages. | Arabic, German, Greek, English, Spanish, Hindi, Romanian, Russian, Thai, Turkish, Vietnamese, Chinese |
| [xstorycloze](xstorycloze/README.md) | Cross-lingual narrative understanding tasks to predict story endings in multiple languages. | Russian, Simplified Chinese, Spanish, Arabic, Hindi, Indonesian, Telugu, Swahili, Basque, Burmese |
| [xwinograd](xwinograd/README.md) | Cross-lingual Winograd schema tasks for coreference resolution in multiple languages. | English, French, Japanese, Portuguese, Russian, Chinese |
## Multimodal Tasks
| Task Family | Description | Modality |
......
......@@ -3,12 +3,14 @@ Take in a YAML, and output all other splits with this YAML
"""
import argparse
import logging
import os
import yaml
from tqdm import tqdm
from lm_eval.utils import eval_logger
eval_logger = logging.getLogger(__name__)
SUBJECTS = {
......
# ACPBench
**Homepage:** https://ibm.github.io/ACPBench/
### Papers
**Title:** ACPBench: Reasoning About Action, Change, and Planning
**Pdf:** https://arxiv.org/pdf/2410.05669
**Task:** `acp_bench`
**Abstract:**
There is an increasing body of work using Large Language Models (LLMs) as agents for orchestrating workflows and making decisions in domains that require planning and multi-step reasoning. As a result, it is imperative to evaluate LMs on core skills required for planning. ACPBench is a benchmark for evaluating the reasoning tasks in the field of planning. The benchmark consists of 7 reasoning tasks over 13 planning domains. The collection is constructed from planning domains described in a formal language. This allows the synthesized problems to have provably correct solutions across many tasks and domains. Further, it allows the luxury to scale without additional human effort, i.e., many additional problems can be created automatically.
**Title:** ACPBench Hard: Unrestrained Reasoning about Action, Change, and Planning
**Pdf:** https://arxiv.org/abs/2503.24378
**Task:** `acp_bench_hard`
**Abstract:**
We introduce ACPBench Hard, a dataset of generative, open-ended questions which LLMs need to answer in order to plan. Models that perform well on these tasks could in principle be integrated into a planner or be used directly as a policy. We discuss the complexity of these tasks as well as the complexity of validating the correctness of their answers and present validation algorithms for each task. Equipped with these validators, we test the performance of a variety of models on our tasks and find that for most of these tasks, the performance of even the largest models is still subpar. Our experiments show that no model outperforms any other in these tasks, and with a few exceptions, all tested language models score below 65%, indicating that even the current frontier language models as well as so-called reasoning models have a long way to go before they can reliably reason about planning.
The dataset is available on [HuggingFace](https://huggingface.co/datasets/ibm-research/acp_bench).
### Citation
......@@ -23,6 +36,19 @@ Homepage: https://ibm.github.io/ACPBench/
publisher = {{AAAI} Press},
year = {2025}
}
@misc{KokelKSS25ACPHard,
title = {ACPBench Hard: Unrestrained Reasoning about Action, Change, and Planning},
author = {Harsha Kokel and
Michael Katz and
Kavitha Srinivas and
Shirin Sohrabi},
year = {2025},
eprint = {2503.24378},
archivePrefix = {arXiv},
primaryClass = {cs.AI},
url = {https://arxiv.org/abs/2503.24378},
}
```
### Groups, Tags, and Tasks
......@@ -33,9 +59,13 @@ Homepage: https://ibm.github.io/ACPBench/
#### Tags
* `acp_bench` : Evaluates `acp_bool_cot_2shot` and `acp_mcq_cot_2shot` (Main variant for ACPBench paper)
* `acp_bool_cot_2shot` : Evaluates `acp_areach_bool`, `acp_app_bool`, `acp_just_bool`, `acp_land_bool`, `acp_prog_bool`, `acp_reach_bool`, `acp_val_bool` with chain-of-thought and 2 shots
* `acp_mcq_cot_2shot` : Evaluates `acp_areach_mcq`, `acp_app_mcq`, `acp_just_mcq`, `acp_land_mcq`, `acp_prog_mcq`, `acp_reach_mcq`, `acp_val_mcq` with chain-of-thought and 2 shots
* `acp_bench_hard` : Evaluates `acp_gen_2shot` (Main variant for ACPBench Hard paper)
* `acp_gen_2shot` : Evaluates `acp_areach_gen`, `acp_app_gen`, `acp_just_gen`, `acp_land_gen`, `acp_nexta_gen`, `acp_prog_gen`, `acp_reach_gen`, `acp_val_gen` with 2 shots
* `acp_bench_hard_with_pddl` : Evaluates `acp_gen_2shot_with_pddl`
* `acp_gen_2shot_with_pddl` : Evaluates `acp_areach_gen_with_pddl`, `acp_app_gen_with_pddl`, `acp_just_gen_with_pddl`, `acp_land_gen_with_pddl`, `acp_nexta_gen_with_pddl`, `acp_prog_gen_with_pddl`, `acp_reach_gen_with_pddl`, `acp_val_gen_with_pddl` with 2 shots
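
As a sketch, a tag can be selected like any task name; the checkpoint below is a placeholder (see the task list after this section for what the tag expands to):

```python
# Hypothetical sketch: tags are selected like task names. The checkpoint is a
# placeholder; `acp_bench_hard` expands to the eight generative 2-shot tasks
# listed under "Tasks" below.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=meta-llama/Llama-3.1-8B-Instruct",
    tasks=["acp_bench_hard"],
)
print(results["results"])
```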
#### Tasks
......@@ -57,6 +87,26 @@ Homepage: https://ibm.github.io/ACPBench/
* `acp_reach_mcq`
* `acp_val_mcq`
8 generative tasks (with just the natural-language description in context):
* `acp_areach_gen`
* `acp_app_gen`
* `acp_just_gen`
* `acp_land_gen`
* `acp_nexta_gen`
* `acp_prog_gen`
* `acp_reach_gen`
* `acp_val_gen`
and the same 8 generative tasks with the natural-language as well as the PDDL description of the domain and problem in context:
* `acp_areach_gen_with_pddl`
* `acp_app_gen_with_pddl`
* `acp_just_gen_with_pddl`
* `acp_land_gen_with_pddl`
* `acp_nexta_gen_with_pddl`
* `acp_prog_gen_with_pddl`
* `acp_reach_gen_with_pddl`
* `acp_val_gen_with_pddl`
> **Note:** The evaluation scripts are taken from the original GitHub repository: https://github.com/IBM/ACPBench
......@@ -77,3 +127,4 @@ If other tasks on this dataset are already supported:
### Change Log
* 03/17/2025 Initial Commit
* 05/13/2025 Adding ACPBench Hard tasks (with and without PDDL)
tag:
- acp_gen_2shot
- acp_bench_hard
dataset_path: ibm-research/acp_bench
test_split: test
doc_to_target: "{{answer}}"
output_type: generate_until
num_fewshot: 2
generation_kwargs:
until:
- "\n\n\n\n"
- "\n\n"
- "**Question**:"
- "**Question:**"
- "Q:"
do_sample: false
max_gen_toks: 1000
temperature: 0.0
metadata:
version: 1.0
process_results: !function acp_utils.process_acp_results
metric_list:
- metric: "score"
aggregation: mean
higher_is_better: True
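
The `process_results` hook above points at `acp_utils.process_acp_results`. A minimal sketch of the shape such a hook takes; the real implementation validates answers with the grammar and planner utilities shown below and may differ substantially:

```python
# Hypothetical sketch of a process_results hook; the real
# acp_utils.process_acp_results validates answers with the grammar/planner
# utilities below, so exact string match is only a stand-in here.
def process_acp_results(doc: dict, results: list) -> dict:
    prediction = results[0].strip()            # raw model generation
    gold = str(doc.get("answer", "")).strip()  # gold answer from the dataset row
    return {"score": 1.0 if prediction == gold else 0.0}  # matches the "score" metric above
```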
NAME: /[a-zA-Z][a-zA-Z0-9-_]*/
LPAR : "("
RPAR : ")"
LSPAR: "["
RSPAR: "]"
COMMA: ","
WS: /[ \n]/
action_none : "None"
action_name : LPAR NAME (WS NAME)* RPAR
action_list : (action_name WS?)*
prog_list : action_name* (COMMA action_name)*
progression_list : LSPAR prog_list RSPAR LSPAR prog_list RSPAR
act : action_name | action_none
index: /[0-9]+/
start: action_list
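
To illustrate what this grammar accepts, a minimal self-contained sketch covering only the action-name rules (assumes the `lark` package; the full grammar also covers indices and progression lists):

```python
# Minimal, self-contained illustration of the action-name rules above
# (assumes `pip install lark`).
from lark import Lark

GRAMMAR = r"""
NAME: /[a-zA-Z][a-zA-Z0-9-_]*/
LPAR : "("
RPAR : ")"
WS: /[ \n]/
action_name : LPAR NAME (WS NAME)* RPAR
action_list : (action_name WS?)*
start: action_list
"""

parser = Lark(GRAMMAR, start="start", parser="lalr")
tree = parser.parse("(pick-up b1) (stack b1 b2)")
print(tree.pretty())  # two action_name subtrees under action_list
```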
import json
import os
from abc import ABC, abstractmethod
from collections import defaultdict
from pathlib import Path
from lm_eval.api.registry import register_filter
from lm_eval.filters.extraction import RegexFilter
try:
import tempfile
import tarski
from kstar_planner import planners as kp
from lark import Lark
from lark.lexer import Token
from lark.visitors import Visitor
from pddl.core import Problem
from pddl.parser.domain import DomainParser
from pddl.parser.problem import ProblemParser
from tarski.grounding.common import StateVariableLite
from tarski.grounding.lp_grounding import LPGroundingStrategy
from tarski.io import PDDLReader
from tarski.io import fstrips as iofs
from tarski.syntax.formulas import is_atom
from tarski.syntax.transform.action_grounding import (
ground_schema_into_plain_operator_from_grounding,
)
from tarski.util import SymbolIndex
except ModuleNotFoundError:
raise ModuleNotFoundError(
"`lark>=1.1.9`, `tarski[clingo]==0.8.2`, `pddl==0.4.2` and `kstar-planner==1.4.2` are required for evaluating the generative tasks. \
Please install via pip install lm-eval[acpbench] or pip install -e .[acpbench]",
)
#########################################################################
# Grammar
GRAMMAR_FILE = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "acp_grammar.lark"
)
class ACPBench_Visitor(Visitor):
def __init__(self) -> None:
super().__init__()
self.action_lists = None
self.action_names = None
self.progression_lists = None
self.prog_lists = None
self.indexes = None
def action_list(self, tree):
self.action_lists = []
def prog_list(self, tree):
if self.prog_lists is not None:
self.progression_lists.append(self.prog_lists)
self.prog_lists = []
def progression_list(self, tree):
self.progression_lists = []
def action_none(self, tree):
self.action_names = "None"
def action_name(self, tree):
act_name = "(" + "".join(tree.children[1:-1]) + ")"
self.action_names = act_name
if self.action_lists is not None:
self.action_lists.append(act_name)
if self.prog_lists is not None:
self.prog_lists.append(act_name)
def index(self, tree):
self.indexes = "".join(tree.children)
if not self.indexes.isnumeric():
self.indexes = None
class ACPGrammarParser(object):
def __init__(self, task) -> None:
self.task = task
with open(GRAMMAR_FILE) as f:
grammar = f.read()
self.acp_parser = Lark(grammar, start=task, parser="lalr")
def parse(self, input, debug=False):
def ignore_errors(e):
if hasattr(e, "token") and e.token.type == "$END":
for x in e.expected:
if x != "WS":
e.interactive_parser.feed_token(
Token(x, self.acp_parser.get_terminal(x).pattern.value)
)
return True
input = input.replace("\n", "")
input = input.strip()
try:
tree = self.acp_parser.parse(input, on_error=ignore_errors)
if debug:
print(tree)
visitor = ACPBench_Visitor()
visitor.visit_topdown(tree)
if self.task == "action_list":
return visitor.action_lists
elif self.task == "act":
return visitor.action_names
elif self.task == "action_name":
return visitor.action_names
elif self.task == "index":
return visitor.indexes
elif self.task == "progression_list":
if visitor.prog_lists not in visitor.progression_lists:
visitor.progression_lists.append(visitor.prog_lists)
return visitor.progression_lists
except Exception as e:
if debug:
print("exception")
print(e)
return None
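# Illustrative parser usage (hedged sketch):
#   ACPGrammarParser("act").parse("(move f3-2f f2-2f)")  ->  "(move f3-2f f2-2f)"
#   ACPGrammarParser("index").parse("3")                 ->  "3"
# Inputs that do not match the grammar make parse() return None, which the
# evaluators below treat as an incorrect answer.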
##############################################################################
# Utils
# Used in next action
def is_on_optimal_plan(domain, problem, action, opt):
with (
tempfile.NamedTemporaryFile() as domain_temp,
tempfile.NamedTemporaryFile() as problem_temp,
):
with open(str(domain_temp.name), "w", encoding="utf8") as file:
file.write(domain.lower())
with open(str(problem_temp.name), "w", encoding="utf8") as file:
file.write(problem.lower())
        # Keep the temp files alive until the end of the function
try:
P = STRIPS(str(domain_temp.name), str(problem_temp.name))
except Exception:
# Unsolvable
return False
a = P.get_action_or_none(action[1:-1])
if a is None:
return False
state = P.init
next_state = progress(state, a)
        if opt is None:
            # Get an optimal plan cost
            plans = generate_optimal_plans_for_problem_state(
                P, state, num_plans=1, timeout=5
            )
            if plans is None:
                # No optimal plan found (e.g., planner timeout); cannot
                # establish that the action is on an optimal plan.
                return False
            opt = len(plans[0]["actions"])
else:
opt = int(opt)
# Getting an optimal plan for the next state
next_plans = generate_optimal_plans_for_problem_state(
P, next_state, num_plans=1, timeout=5
)
if next_plans is None:
return False
next_opt = len(next_plans[0]["actions"])
return next_opt + 1 == opt
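# E.g., if the optimal cost from the current state is 5 and, after applying
# the action, the remaining optimal cost is 4, the action lies on an optimal
# plan (4 + 1 == 5).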
# Used in justification
def is_plan(domain, problem, new_plan):
P = get_STRIPS(domain, problem)
if P is None:
# Unsolvable
return False
# Check if new_plan is a plan
current_state = P.init
for action in new_plan:
applicable_actions = P.get_applicable_actions(current_state)
app_actions_list = [f"({a.name.lower()})" for a in applicable_actions]
if action.lower() not in app_actions_list:
return False
a = applicable_actions[app_actions_list.index(action.lower())]
current_state = progress(current_state, a)
return entails(current_state, P.goal)
# Used in action reachability
def get_action_preconditions(domain, problem, action):
P = get_STRIPS(domain, problem)
assert P is not None, f"Domain\n{domain}\nProblem\n{problem}\nAction: {action}"
a = P.get_action_or_none(action[1:-1])
if a is None:
return a
return [f"({f})" for f in a.pres]
def generate_optimal_plans_for_problem_state(P, state, num_plans, timeout):
with (
tempfile.NamedTemporaryFile() as domain_temp,
tempfile.NamedTemporaryFile() as problem_temp,
):
create_tmp_dom_prob_replace_init(P, state, domain_temp, problem_temp)
plans = generate_top_q_plans(
domain=str(domain_temp.name),
problem=str(problem_temp.name),
num_plans=num_plans,
quality_bound=1.0,
timeout=timeout,
)
# print(plans)
if plans is None or len(plans["plans"]) == 0:
return None
return plans["plans"]
def generate_top_q_plans(domain, problem, num_plans=10, quality_bound=1.0, timeout=30):
# print("Running K* planner")
plans = kp.plan_unordered_topq(
domain_file=Path(domain),
problem_file=Path(problem),
number_of_plans_bound=num_plans,
quality_bound=quality_bound,
timeout=timeout,
)
return plans
# Used in (action) reachability
def is_unsolvable_new_goal(domain, problem, new_goal):
goal = extract_goal(problem)
new_problem = problem.replace(goal, f"(:goal {new_goal} )")
return is_unsolvable(domain, new_problem)
def is_unsolvable(domain, problem):
with (
tempfile.NamedTemporaryFile() as domain_temp,
tempfile.NamedTemporaryFile() as problem_temp,
):
with open(str(domain_temp.name), "w", encoding="utf8") as file:
file.write(str(domain))
with open(str(problem_temp.name), "w", encoding="utf8") as file:
file.write(str(problem))
plans = kp.plan_unordered_topq(
domain_file=Path(str(domain_temp.name)),
problem_file=Path(str(problem_temp.name)),
quality_bound=1.0,
number_of_plans_bound=1,
timeout=3,
)
if len(plans["planner_error"]) > 0:
fl = plans["planner_error"].split("\n")[0]
print(f"Planner error: {fl}")
return False
if plans is None or len(plans["plans"]) == 0:
return plans["unsolvable"]
return False
def extract_goal(prob):
a = prob.split("(:goal")[1]
cp = 1
for i, c in enumerate(a):
if c == ")":
cp -= 1
if c == "(":
cp += 1
if cp == 0:
return "(:goal" + a[: i + 1]
    assert False, "Unbalanced parentheses in the (:goal ...) block"
def entails(state, partialstate):
return partialstate <= state
def progress(state, act):
assert entails(state, act.pres), (
"Cannot progress with inconsistent state / action precondition:\n\t Action: "
+ act.name
+ "\n\t State: \n\t\t"
+ "\n\t\t".join(state)
)
return (state - act.dels) | act.adds
def regress(state, act):
assert len(state & act.dels) == 0, (
"Cannot regress with inconsistent state / action delete effect:\n\t Action: "
+ act.name
+ "\n\t State: \n\t\t"
+ "\n\t\t".join(state)
)
return (state - act.adds) | act.pres
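# Worked example (illustrative): for state {"p", "q"} and an action a with
# pres={"p"}, adds={"r"}, dels={"q"}:
#   progress({"p", "q"}, a) == {"p", "r"}
#   regress({"p", "r"}, a) == {"p"}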
def get_STRIPS(domain, problem):
with (
tempfile.NamedTemporaryFile() as domain_temp,
tempfile.NamedTemporaryFile() as problem_temp,
):
with open(str(domain_temp.name), "w", encoding="utf8") as file:
file.write(domain.lower())
with open(str(problem_temp.name), "w", encoding="utf8") as file:
file.write(problem.lower())
try:
P = STRIPS(str(domain_temp.name), str(problem_temp.name))
return P
except Exception as e:
print(f"||{e}||")
return None
def create_tmp_dom_prob_replace_init(P, state, result_domain_file, result_problem_file):
d, p = P.PDDL_replace_init_pddl_parser(state)
with open(str(result_domain_file.name), "w", encoding="utf8") as file:
file.write(str(d))
with open(str(result_problem_file.name), "w", encoding="utf8") as file:
file.write(str(p))
return d, p
def fix_name(s):
# (act param)
if "(" == s[0] and ")" == s[-1]:
return s[1:-1]
# make it space separated
s = s.replace(", ", " ").replace(",", " ")
# act(param)
if "(" in s:
assert ")" == s[-1], f"Broken name? {s}"
s = s.replace("(", " ").replace(")", "")
# act param
return s
def get_atoms_pddl(d, p, atoms):
objs = set()
preds = defaultdict(list)
for atom in atoms:
a = atom.lower().strip().split(" ")
args = a[1:]
preds[a[0]].append(args)
objs |= set(args)
constants = [o for o in p.objects | d.constants if o.name.lower() in objs]
constants_dict = {}
for c in constants:
constants_dict[c.name.lower()] = c
assert len(objs) == len(constants), (
f"Could not identify all objects: {objs - set(constants_dict.keys())} not found, {set(constants_dict.keys()) - objs} should not be there"
)
state = []
covered_preds = set()
for f in d.predicates:
name = f.name.lower()
if name in preds:
covered_preds.add(name)
assert len(preds[name][0]) == f.arity, (
f"The arity does not match: {preds[name]} vs {f.terms}"
)
# Going over the lists of objects, adding ground predicate for each
for ob in preds[name]:
c = [constants_dict[o] for o in ob]
state.append(f(*c))
assert len(covered_preds) == len(preds.keys()), (
f"Covered predicates: \n{sorted(list(covered_preds))} vs \n{sorted(list(preds.keys()))}"
)
return set(state)
class Action:
def __init__(self, name, pre, add, delete):
self.name = name
self.pres = pre
self.adds = add
self.dels = delete
def __str__(self):
pres = "{" + ", ".join([f"({a})" for a in self.pres]) + "}"
adds = "{" + ", ".join([f"({a})" for a in self.adds]) + "}"
dels = "{" + ", ".join([f"({a})" for a in self.dels]) + "}"
return f"< {self.name}, {pres}, {adds}, {dels} >"
def toJSON(self):
return json.dumps(
{
"name": self.name,
"preconditions": [f"({a})" for a in self.pres],
"add_effects": [f"({a})" for a in self.adds],
"delete_effects": [f"({a})" for a in self.dels],
},
sort_keys=True,
indent=4,
)
def __repr__(self):
return self.name
def __eq__(self, action):
return self.name == action.name
def __hash__(self):
return hash(self.name)
class STRIPS:
def __init__(self, domain, problem):
self.domain_file = domain
self.problem_file = problem
self.reader = PDDLReader(raise_on_error=True)
self.reader.parse_domain(domain)
self.problem = self.reader.parse_instance(problem)
(self.grounded_fluents, init, goal, self.operators, self.grounder) = (
self.ground_problem(self.problem)
)
self.fluents = set([fix_name(str(f)) for f in self.grounded_fluents])
self.fluents_map = dict()
for f in self.grounded_fluents:
self.fluents_map[fix_name(str(f))] = f
self.init = set([fix_name(str(f)) for f in init])
self.goal = set([fix_name(str(f)) for f in goal])
self.actions = set()
self.action_map = {}
self.init_fluents = [self.fluents_map[f] for f in self.init]
self.static_predicates = [i.name for i in self.grounder.static_symbols]
for op in self.operators:
act = self.operator_to_action(op)
self.actions.add(act)
self.action_map[act.name.lower()] = act
def __str__(self):
fluents = "P = {" + ", ".join([f"({a})" for a in self.fluents]) + "}"
init = "I = {" + ", ".join([f"({a})" for a in self.init]) + "}"
goal = "G = {" + ", ".join([f"({a})" for a in self.goal]) + "}"
actions = "A = {" + "\n ".join([a.__str__() for a in self.actions]) + "}"
return fluents + ",\n" + init + "\n" + goal + "\n" + actions
def toJSON(self):
actions = [a.toJSON() for a in self.actions]
return json.dumps(
{
"fluents": list(self.fluents),
"initial_state": list(self.init),
"goal": list(self.goal),
"actions": actions,
},
sort_keys=True,
indent=4,
)
def operator_to_action(self, op, check_fluents=True, check_static=False):
adds = {
fix_name(str(f.atom)) for f in op.effects if isinstance(f, iofs.AddEffect)
} & self.fluents
dels = {
fix_name(str(f.atom)) for f in op.effects if isinstance(f, iofs.DelEffect)
} & self.fluents
pre = self.fix_pre_name(op.precondition)
if check_fluents:
pre = pre & self.fluents
if check_static:
pre = {p for p in pre if p.split()[0] not in self.static_predicates}
act = Action(fix_name(str(op)), pre, adds, dels)
return act
def fix_pre_name(self, precondition):
if not is_atom(precondition):
return {fix_name(str(f)) for f in precondition.subformulas}
return {fix_name(str(precondition))}
def action(self, name):
return self.action_map[fix_name(name).lower()]
def get_action_or_none(self, name):
if "(" in name and ")" != name[-1]:
return None
return self.action_map.get(fix_name(name).lower(), None)
def fluent(self, name):
return fix_name(name)
def static_symbols(self):
return list(self.grounder.static_symbols)
def fluent_symbols(self):
return list(self.grounder.fluent_symbols)
def get_grounded_atoms(self, symbol):
variables = SymbolIndex()
lang = symbol.language
key = "atom_" + symbol.name
model = self.grounder._solve_lp()
if (
key in model
): # in case there is no reachable ground state variable from that fluent symbol
for binding in model[key]:
binding_with_constants = tuple(lang.get(c) for c in binding)
variables.add(StateVariableLite(symbol, binding_with_constants))
return variables
def get_applicable_actions(self, s):
return [a for a in self.actions if entails(s, a.pres)]
def ground_problem(self, problem):
grounder = LPGroundingStrategy(problem, include_variable_inequalities=True)
action_groundings = grounder.ground_actions()
operators = []
for action_name, groundings in action_groundings.items():
action = problem.get_action(action_name)
for grounding in groundings:
operators.append(
ground_schema_into_plain_operator_from_grounding(action, grounding)
)
grounded_fluents = set(
[
grounded_fluent.to_atom()
for grounded_fluent in grounder.ground_state_variables().objects
]
)
init = [f for f in problem.init.as_atoms() if f in grounded_fluents]
if isinstance(problem.goal, tarski.syntax.Atom):
goal = [problem.goal]
else:
goal = [f for f in problem.goal.subformulas if f in grounded_fluents]
return (grounded_fluents, init, goal, operators, grounder)
def get_static(self):
static_symbols = self.static_symbols()
ret = []
for symbol in static_symbols:
ret.extend(self.get_grounded_atoms(symbol))
return set([fix_name(str(x)) for x in ret])
    def PDDL_replace_init_pddl_parser(self, s):
        # Read the PDDL files with context managers so the handles are closed
        with open(self.domain_file, "r") as f:
            d = DomainParser()(f.read().lower())
        with open(self.problem_file, "r") as f:
            p = ProblemParser()(f.read().lower())
new_state = get_atoms_pddl(d, p, s | self.get_static())
new_p = Problem(
p.name, domain=d, objects=p.objects, init=new_state, goal=p.goal
)
return d, new_p
def parse_ans(response: str, parser: ACPGrammarParser, task: str):
return [parser.parse(clean_answer(resp, task)) for resp in response]
# def parse_ans(response : str, parser : ACPGrammarParser, task : str):
# ans = [parser.parse(clean_answer(resp, task), debug=True) for resp in response]
# if any(elem is None for elem in ans) or any(elem is None for elem in ans[0]):
# return None
# return ans
def remove_garbage(s):
while True:
if s.endswith("."):
s = s[:-1]
        elif s.endswith("\n"):
            s = s[:-1]
else:
break
return s.rstrip()
def compare_str(s1, s2):
return remove_garbage(s1).lower() == remove_garbage(s2).lower()
def compare(l1, l2):
if not isinstance(l1, list):
return compare_str(l1, l2)
    if not isinstance(l2, list) or len(l2) != len(l1):
        # A non-list or differently sized answer cannot match
        return False
for i, v in enumerate(l1):
if not compare(v, l2[i]):
return False
return True
def check_prog_response(resp):
if (
"Positive Effects".lower() in resp.lower()
and "Negative Effects".lower() in resp.lower()
):
if "[" not in resp:
return True
return False
def clean_answer(resp, task):
# Minor cleanup
if "progression_gen" in task:
# Check for Positive Effects and Negative Effects instead of separation
if check_prog_response(resp):
# replace **Positive Effects** with "["
# replace **Negative Effects** with "] ["
# append "]" to the end
resp2 = resp.lower()
resp2 = resp2.replace("*", "")
resp2 = resp2.replace("positive effects", "[")
resp2 = resp2.replace("negative effects", "] [")
resp2 = resp2 + "]"
return resp2
if "action_justification_gen" in task:
# Check for "simplified plan:"
if "simplified plan:" in resp.lower():
resp2 = resp.lower()
resp2 = resp2.replace("*", "")
resp2 = resp2.split("simplified plan:")[1]
return resp2
return resp
def get_grammar_task(task):
# print(task)
if task == "reachable_atom_gen":
return "act"
elif task == "progression_gen":
return "progression_list"
elif task == "validation_gen":
return "index"
elif task == "reachable_action_gen":
return "act"
elif task == "action_justification_gen":
return "action_list"
elif task == "landmarks_gen":
return "act"
elif task == "goal_closer_gen":
return "action_name"
elif task == "applicable_actions_gen":
return "action_list"
##############################################################################
# Evaluators
def fix_action_name(a):
assert a.startswith("(") and a.endswith(")")
return "(" + " ".join([x.strip() for x in a[1:-1].split(" ") if len(x) > 0]) + ")"
def str_remove_before_first_parentheses(s):
if s.startswith("("):
return s
try:
return s[s.index("(") :]
except Exception:
return ""
def str_remove_after_last_parentheses(s):
if s.endswith(")"):
return s
i = s.rfind(")")
if i == -1:
return ""
return s[: i + 1]
def cleanup_answer(ans):
if isinstance(ans, str):
ans = str_remove_before_first_parentheses(ans)
ans = str_remove_after_last_parentheses(ans)
ans = ans.lower()
ans = (
ans.replace(")\n(", ")######(")
.replace("),(", ")######(")
.replace(") (", ")######(")
.split("######")
)
return ans
if isinstance(ans, list):
res = []
for x in ans:
res.extend(cleanup_answer(x))
return res
def set_equal(ans1, ans2):
return set(ans1) == set(ans2)
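# Illustrative (hedged): cleanup_answer lower-cases and splits concatenated
# actions, e.g.
#   cleanup_answer("(Load-Truck p0 t0 l0-0) (Drive-Truck t0 l0-0 l0-1 c0)")
#   == ["(load-truck p0 t0 l0-0)", "(drive-truck t0 l0-0 l0-1 c0)"]
# set_equal then compares answers order- and duplicate-insensitively.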
class BaseEvaluator(ABC):
def __init__(self) -> None:
self.scores = []
@abstractmethod
def get_score(self, ans, doc):
pass
def add_scores(self, scores):
self.scores.extend(scores)
def get_avg_score(self):
avg_score = sum(self.scores) / len(self.scores)
return avg_score
def get_evaluator(group):
if group == "applicable_actions_gen":
return ApplicabilityEvaluator()
elif group == "progression_gen":
return ProgressionEvaluator()
elif group == "validation_gen":
return ValidationEvaluator()
elif group == "reachable_atom_gen":
return ReachabilityEvaluator()
elif group == "goal_closer_gen":
return NextActionEvaluator()
elif group == "action_justification_gen":
return JustificationEvaluator()
elif group == "landmarks_gen":
return LandmarksEvaluator()
elif group == "reachable_action_gen":
return ActionReachabilityEvaluator()
    raise ValueError(f"Group {group} not found")
"""
Action Reachability task: generate a valid action that is not applicable to any reachable state.
answer: A subset of actions that are known to be unreachable (not an exhaustive set).
It is empty only when we *know* that there are no such actions.
"""
class ActionReachabilityEvaluator(BaseEvaluator):
def get_score(self, ans, doc):
real_answer = doc["answer"]
if not real_answer or len(real_answer) == 0:
# The correct answer is None
self.add_scores(
["none" == x.strip().lower() if x is not None else False for x in ans]
)
else:
for x in ans:
if x is None:
self.scores.append(False)
continue
action = x.strip().lower()
if action in real_answer:
# The answer is in the subset of stored correct answers
self.scores.append(True)
continue
prec = get_action_preconditions(
doc["PDDL_domain"].lower(), doc["PDDL_problem"].lower(), action
)
if prec is None:
# The answer does not correspond to a valid action
self.scores.append(False)
else:
# Need to run a planner on a task with the answer action preconditions as the new goal
prec = f"(and {' '.join(prec)})"
self.scores.append(
is_unsolvable_new_goal(
doc["PDDL_domain"].lower(),
doc["PDDL_problem"].lower(),
prec,
)
)
return self.get_avg_score()
"""
Action Applicability task: generate all actions that are applicable in the current state.
answer: A set of all applicable actions.
"""
class ApplicabilityEvaluator(BaseEvaluator):
def get_score(self, ans, doc):
real_answer = doc["answer"]
real_answer = [a.lower() for a in real_answer]
ans = [[fix_action_name(a) for a in x] if x is not None else None for x in ans]
# Check if the answer is equal (as a set) to the real stored answer
self.add_scores(
[
set_equal(real_answer, cleanup_answer(x)) if x is not None else False
for x in ans
]
)
return self.get_avg_score()
def is_subsequence(plan, new_plan):
i = 0
for a in plan:
if a == new_plan[i]:
i += 1
if len(new_plan) == i:
# Done
return True
return False
def is_subsequence_and_plan(domain, problem, plan, new_plan):
if len(plan) <= len(new_plan):
return False
if not is_subsequence(plan, new_plan):
return False
return is_plan(domain, problem, new_plan)
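# For instance, is_subsequence(["(a)", "(b)", "(c)"], ["(a)", "(c)"]) holds;
# the candidate must additionally be strictly shorter than the original plan
# and itself be a valid plan.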
"""
Justification task: generate a proper subsequence of the given plan that is also a plan.
answer: A list of examples of actions that can be removed (ignored in evaluation).
"""
class JustificationEvaluator(BaseEvaluator):
def get_score(self, ans, doc):
# Sequence of actions (plan) from the question
if "inputs" in doc: # old field name
seq = doc["inputs"][19:-147]
else:
seq = doc["question"][19:-147]
seq = seq.replace(") (", ")######(").split("######")
for x in ans:
if x is None:
self.scores.append(False)
continue
# An answer plan candidate
x = [fix_action_name(a) for a in x]
if len(x) == 0:
# Wrong answer - never an empty sequence
self.scores.append(0)
continue
# Check if the plan candidate from the answer (a) is a proper subsequence of the plan in the question and (b) is a plan.
self.scores.append(
is_subsequence_and_plan(
doc["PDDL_domain"].lower(), doc["PDDL_problem"].lower(), seq, x
)
)
return self.get_avg_score()
"""
Landmarks task: generate a fact that is a non-trivial landmark for the current state.
answer: A list of facts that are found to be landmarks and a list of facts that are found to be non-landmarks.
The questions are generated only for cases where all facts either
(a) hold in the current state,
(b) true in goal,
(c) are found to be landmarks, or
(d) are found to be non-landmarks.
In such cases, the evaluation is simple, it does not require checking whether a fact is a landmark, it was
already done during question generation.
"""
class LandmarksEvaluator(BaseEvaluator):
def get_score(self, ans, doc):
# The set of facts that are found to be landmarks
real_answer = doc["answer"]
real_answer_yes = [a.lower() for a in real_answer["yes"]]
for x in ans:
if x is None:
self.scores.append(False)
continue
if x.strip().lower() in real_answer_yes:
# The answer fact is known to be landmark
self.scores.append(True)
            elif x.strip().lower() == "none":
                # "None" is correct only when there are no known landmarks;
                # questions are generated such that this implies there are no
                # non-trivial landmarks at all.
                self.scores.append(len(real_answer_yes) == 0)
else:
# All other cases the answer is incorrect
self.scores.append(False)
return self.get_avg_score()
"""
Next Action task: generate an action that takes us closer to the goal.
answer:
(a) A list of applicable actions that are known to be correct answers
(b) A list of applicable actions that are known to be incorrect answers
(c) The rest of the applicable actions (maybe).
"""
class NextActionEvaluator(BaseEvaluator):
def get_score(self, ans, doc):
real_answer = doc["answer"]
real_answer_yes = [a.lower() for a in real_answer["yes"]]
real_answer_no = [a.lower() for a in real_answer["no"]]
real_answer_maybe = [a.lower() for a in real_answer["maybe"]]
# The cost of the optimal plan from the current state
opt = real_answer.get("opt", None)
for x in ans:
if x is None:
self.scores.append(False)
continue
action = x.strip().lower()
if action in real_answer_yes:
# Known to be correct
self.scores.append(True)
elif action in real_answer_no:
# Known to be incorrect
self.scores.append(False)
elif action not in real_answer_maybe:
# Not applicable, must be incorrect
self.scores.append(False)
else:
# Unknown, need to run a planner to check whether the state that results from applying the action is closer to the goal
# meaning has smaller optimal plan cost.
self.scores.append(
is_on_optimal_plan(
doc["PDDL_domain"].lower(),
doc["PDDL_problem"].lower(),
action,
opt,
)
)
return self.get_avg_score()
"""
Progression task: generate the positive and negative effects of an action in the current state.
answer:
(a) A list of facts that were false and become true, when the action is applied
(b) A list of facts that were true and become false, when the action is applied
"""
class ProgressionEvaluator(BaseEvaluator):
def get_score(self, ans, doc):
real_answer = doc["answer"]
real_answer_pos = [a.lower() for a in real_answer["pos"]]
real_answer_neg = [a.lower() for a in real_answer["neg"]]
for x in ans:
# The answer should be two lists. We allow for a single list and assume that the second one is empty (relaxed evaluation).
if x is None or len(x) > 2 or len(x) < 1:
self.scores.append(False)
else:
p = cleanup_answer(x[0])
if len(x) == 2:
n = cleanup_answer(x[1])
else:
# Assuming the last element is dropped because it is empty
n = []
                # Check whether the answer matches the correct answers as sets.
                checks = [set_equal(real_answer_pos, p), set_equal(real_answer_neg, n)]
                self.scores.append(all(checks))
return self.get_avg_score()
"""
Reachability task: generate a valid fact that will never become true in any reachable state.
answer: A subset of facts that are known to be unreachable (not an exhaustive set).
It is empty only when we *know* that there are no such facts.
"""
class ReachabilityEvaluator(BaseEvaluator):
def get_score(self, ans, doc):
real_answer = doc["answer"]
real_answer = [f"({x.strip().lower()})" for x in real_answer]
if len(real_answer) == 0:
# The correct answer is None
self.add_scores(
["none" == x.strip().lower() if x is not None else False for x in ans]
)
else:
for x in ans:
if x is None:
self.scores.append(False)
elif x.strip().lower() in real_answer:
# The answer is in the subset of stored correct answers
self.scores.append(True)
else:
# Need to run a planner on a task with the answer fact as the new goal
atom = x.strip().lower()
self.scores.append(
is_unsolvable_new_goal(
doc["PDDL_domain"].lower(),
doc["PDDL_problem"].lower(),
atom,
)
)
return self.get_avg_score()
"""
Validation task: generate an index of the first inapplicable action in the given sequence.
answer: the correct index.
"""
class ValidationEvaluator(BaseEvaluator):
def get_score(self, ans, doc):
real_answer = str(doc["answer"])
assert int(real_answer) >= 0, (
f"The index must be non-negative, received {real_answer}"
)
# Exact match
self.add_scores(
[
real_answer.lower() == x.strip().lower() if x is not None else False
for x in ans
]
)
return self.get_avg_score()
##############################################################################
def dump_item(item, **kwargs):
return json.dumps(item)
def parse_prediction(prediction):
try:
ans = json.loads(prediction.strip())
response = ans.get("answer", None)
return response
except Exception as e:
print(f"Exception occurred {e}")
return prediction
@register_filter("ACP_grammar_filter")
class ACPGrammarFilter(RegexFilter):
    """Filter that parses model responses with the ACPBench Lark grammar."""

    def __init__(self, *args, **kwargs):
        self.parser = ACPGrammarParser(kwargs["grammar_task"])
        self.clean = kwargs.get("clean")
def clean_pos_neg(self, resp):
# Check for Positive Effects and Negative Effects instead of separation
if check_prog_response(resp):
resp2 = resp.lower()
resp2 = resp2.replace("*", "")
resp2 = resp2.replace("positive effects", "[")
resp2 = resp2.replace("negative effects", "] [")
resp2 = resp2 + "]"
return resp2
return resp
def clean_simplified_plan(self, resp):
# Check for "simplified plan:"
if "simplified plan:" in resp.lower():
resp2 = resp.lower()
resp2 = resp2.replace("*", "")
resp2 = resp2.split("simplified plan:")[1]
return resp2
return resp
def apply(self, resps, docs):
if self.clean == "pos_neg":
filtered_resps = [
[self.parser.parse(self.clean_pos_neg(r)) for r in resp]
for resp in resps
]
elif self.clean == "simplified plan":
filtered_resps = [
[self.parser.parse(self.clean_simplified_plan(r)) for r in resp]
for resp in resps
]
else:
filtered_resps = [[self.parser.parse(r) for r in resp] for resp in resps]
return filtered_resps
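# Wiring note (per the task YAMLs below): instances are built from filter_list
# entries such as
#   - function: "ACP_grammar_filter"
#     grammar_task: "act"
# optionally with clean: "pos_neg" or clean: "simplified plan" to normalise
# free-form answers before grammar parsing.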
def process_acp_results(doc, results):
return {"score": get_evaluator(doc["group"]).get_score(results, doc)}
def get_score(references, predictions, **kwargs):
# print(f"References: {references}")
# print(f"Predictions: {predictions}")
data = json.loads(references[0].strip())
real_ans = data["answer"]
task = data["group"]
responses = [parse_prediction(prediction) for prediction in predictions]
print(f"Real answer: {real_ans}")
print(f"Model answers: {responses}")
parser = ACPGrammarParser(get_grammar_task(task))
ans = parse_ans(responses, parser, task)
print(f"Parsed model answers: {ans}")
score = get_evaluator(task).get_score(ans, data)
return {"get_score": score}
task: acp_areach_gen
dataset_name: acp_areach_gen
include: _gen_yaml_2shot
fewshot_config:
sampler: first_n
samples:
- context: "A robot is in a grid and can only move to places that are connected to its current position. The grid size is 5x5, and the locations are of the form fi-jf (e.g., f3-2f or f0-1f). The grid cells are connected to their neighbors (e.g., f1-2f is connected to the four neighbors f0-2f, f2-2f, f1-1f, and f1-3f). Some positions on the grid are locked and can be opened with a key of a matching shape. The robot has an arm that can pick up a key when the key is in same location as the robot and the arm is empty. There are 2 keys in 0 different shapes: Key key0-0 is of shape shape0, Key key0-1 is of shape shape0. Currently, the robot is at position f2-2f and its arm is empty. All the positions are open except the following: f2-0f has shape0 shaped lock, f4-2f has shape0 shaped lock. Key key0-0 is at position f1-2f. Key key0-1 is at position f1-3f. The available actions are: (unlock ?curpos ?lockpos ?key ?shape) - unlock the place ?lockpos with key ?key of shape ?shape from the current position place ?curpos, (move ?curpos ?nextpos) - move from place ?curpos to place ?nextpos, (pickup ?curpos ?key) - retrieve the key ?key from its current position ?curpos, (pickup-and-loose ?curpos ?newkey ?oldkey) - pick up the key ?newkey from the current position ?curpos and loose the key ?oldkey which is being held, and (putdown ?curpos ?key) - put the key ?key at the current position place ?curpos."
question: "What action can never become applicable, in any state reachable from the current state?"
answer: "(unlock f0-3f f0-4f key0-0 shape0)"
- context: "There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. There are 2 trucks and 1 airplane, as well as 4 packages. There are 4 locations across 2 cities. The locations are in cities as follows: l1-1 and l1-0 are in c1; l0-0 and l0-1 are in c0. Currently, t1 is at l1-1, a0 is at l1-0, p0 is at l0-0, t0 is at l0-1, p2 is in a0, p1 is in t1, p3 is in t0. The available actions are: (load-truck ?obj ?truck ?loc) - load the object ?obj from location ?loc into the truck ?truck, (load-airplane ?obj ?airplane ?loc) - load object ?obj into airplane ?airplane at location ?loc, (unload-truck ?obj ?truck ?loc) - offload the object ?obj from the truck ?truck at location ?loc, (unload-airplane ?obj ?airplane ?loc) - offload the object ?obj from the airplane ?airplane at location ?loc, (drive-truck ?truck ?loc-from ?loc-to ?city) - navigate the truck ?truck from location ?loc-from in city ?city to location ?loc-to in the same city, and (fly-airplane ?airplane ?loc-from ?loc-to) - operate the airplane ?airplane from airport ?loc-from to airport ?loc-to."
question: "What action can never become applicable, in any state reachable from the current state?"
answer: "(drive-truck t0 l1-1 l0-0 c0)"
doc_to_text: "**Question**: {{context}} {{question}} Each action starts with an opening parenthesis and ends with closing parenthesis. Provide one action or None. **Final Answer**:"
filter_list:
- name: "acp_grammar_parse"
filter:
- function: "ACP_grammar_filter"
grammar_task: "act"
- function: "take_first"
task: acp_app_gen
dataset_name: acp_app_gen
include: _gen_yaml_2shot
fewshot_config:
sampler: first_n
samples:
- context: "A robot is in a grid and can only move to places that are connected to its current position. The grid size is 5x5, and the locations are of the form fi-jf (e.g., f3-2f or f0-1f). The grid cells are connected to their neighbors (e.g., f1-2f is connected to the four neighbors f0-2f, f2-2f, f1-1f, and f1-3f). Some positions on the grid are locked and can be opened with a key of a matching shape. The robot has an arm that can pick up a key when the key is in same location as the robot and the arm is empty. There are 2 keys in 1 different shapes: Key key0-1 is of shape shape0, Key key0-0 is of shape shape0. Currently, the robot is at position f3-2f and its arm is empty. All the positions are open except the following: f2-0f has shape0 shaped lock, f4-2f has shape0 shaped lock. Key key0-0 is at position f2-2f. Key key0-1 is at position f1-3f. The available actions are: (unlock ?curpos ?lockpos ?key ?shape) - unlock the place ?lockpos with key ?key of shape ?shape from the current position place ?curpos, (move ?curpos ?nextpos) - travel from the current position ?curpos to the next position ?nextpos, (pickup ?curpos ?key) - pick up key ?key from place ?curpos, (pickup-and-loose ?curpos ?newkey ?oldkey) - pick up the key ?newkey at the current position place ?curpos and loose the key ?oldkey being held, and (putdown ?curpos ?key) - put down key ?key at current position place ?curpos."
question: "Generate the list of all ground actions that are applicable in this state."
answer: "[(move f3-2f f3-1f), (move f3-2f f2-2f), (move f3-2f f3-3f)]"
- context: "There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. There are 2 trucks and 1 airplane, as well as 4 packages. There are 4 locations across 2 cities. The locations are in cities as follows: l0-0 and l0-1 are in c0; l1-0 and l1-1 are in c1. Currently, t1, p2, and p3 are at l1-0, a0 is at l0-0, t0 is at l0-1, p1 and p0 are in t1. The available actions are: (load-truck ?obj ?truck ?loc) - load object ?obj into truck ?truck at location ?loc, (load-airplane ?obj ?airplane ?loc) - load object ?obj into airplane ?airplane at location ?loc, (unload-truck ?obj ?truck ?loc) - unload object ?obj from truck ?truck at location ?loc, (unload-airplane ?obj ?airplane ?loc) - remove the object ?obj from the airplane ?airplane and place it on the location ?loc, (drive-truck ?truck ?loc-from ?loc-to ?city) - navigate the truck ?truck from its current location ?loc-from in city ?city to the new location ?loc-to within the same city, and (fly-airplane ?airplane ?loc-from ?loc-to) - fly airplane ?airplane from airport ?loc-from to airport ?loc-to."
question: "Generate the list of all ground actions that are applicable in this state."
answer: "[(drive-truck t1 l1-0 l1-0 c1), (drive-truck t0 l0-1 l0-0 c0), (load-truck p2 t1 l1-0), (unload-truck p0 t1 l1-0), (drive-truck t0 l0-1 l0-1 c0), (fly-airplane a0 l0-0 l1-0), (fly-airplane a0 l0-0 l0-0), (unload-truck p1 t1 l1-0), (drive-truck t1 l1-0 l1-1 c1), (load-truck p3 t1 l1-0)]"
doc_to_text: "**Question**: {{context}} {{question}} Each action starts with an opening parenthesis and ends with closing parenthesis. Provide only the actions. **Final Answer**:"
filter_list:
- name: "acp_grammar_parse"
filter:
- function: "ACP_grammar_filter"
grammar_task: "action_list"
- function: "take_first"
task: acp_just_gen
dataset_name: acp_just_gen
include: _gen_yaml_2shot
fewshot_config:
sampler: first_n
samples:
- context: "A robot is in a grid and can only move to places that are connected to its current position. The grid size is 5x5, and the locations are of the form fi-jf (e.g., f3-2f or f0-1f). The grid cells are connected to their neighbors (e.g., f1-2f is connected to the four neighbors f0-2f, f2-2f, f1-1f, and f1-3f). Some positions on the grid are locked and can be opened with a key of a matching shape. The robot has an arm that can pick up a key when the key is in same location as the robot and the arm is empty. There are 2 keys in 1 different shapes: Key key0-0 is of shape shape0, Key key0-1 is of shape shape0. Currently, the robot is at position f3-3f and its arm is empty. All the positions are open except the following: f4-2f has shape0 shaped lock, f2-0f has shape0 shaped lock. Key key0-0 is at position f2-2f. Key key0-1 is at position f1-3f. The available actions are: (unlock ?curpos ?lockpos ?key ?shape) - unlock place ?lockpos with key ?key of shape ?shape from current position place ?curpos, (move ?curpos ?nextpos) - move from ?curpos to ?nextpos, (pickup ?curpos ?key) - retrieve the key ?key from its current position ?curpos, (pickup-and-loose ?curpos ?newkey ?oldkey) - pick up key ?newkey at current position place ?curpos and loose key ?oldkey being held, and (putdown ?curpos ?key) - put down the key ?key at the current position ?curpos. The goal is to reach a state where the following facts hold: Key key0-0 is at f2-0f location and Key key0-1 is at f1-3f location."
question: "Simplify the plan [(move f3-3f f3-2f), (move f3-2f f2-2f), (pickup f2-2f key0-0), (move f2-2f f2-1f), (putdown f2-1f key0-0), (pickup f2-1f key0-0), (unlock f2-1f f2-0f key0-0 shape0), (move f2-1f f2-0f), (putdown f2-0f key0-0)] by removing either a single action or a pair of consecutive actions, while still maintaining a valid plan. Provide the resulting simplified plan."
answer: "[(move f3-3f f3-2f), (move f3-2f f2-2f), (pickup f2-2f key0-0), (move f2-2f f2-1f), (unlock f2-1f f2-0f key0-0 shape0), (move f2-1f f2-0f), (putdown f2-0f key0-0)]"
- context: "There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. There are 2 trucks and 1 airplane, as well as 4 packages. There are 4 locations across 2 cities. The locations are in cities as follows: l1-1 and l1-0 are in c1; l0-0 and l0-1 are in c0. Currently, p2, p1, and p3 are at l1-0, p0 and t1 are at l1-1, t0 is at l0-1, a0 is at l0-0. The available actions are: (load-truck ?obj ?truck ?loc) - load the object ?obj from location ?loc into the truck ?truck, (load-airplane ?obj ?airplane ?loc) - load the object ?obj from location ?loc onto the airplane ?airplane, (unload-truck ?obj ?truck ?loc) - unload the object ?obj from the truck ?truck at location ?loc, (unload-airplane ?obj ?airplane ?loc) - remove the object ?obj from the airplane ?airplane and place it on the location ?loc, (drive-truck ?truck ?loc-from ?loc-to ?city) - drive truck ?truck from location ?loc-from in city ?city to location ?loc-to in the same city, and (fly-airplane ?airplane ?loc-from ?loc-to) - fly the airplane ?airplane from location ?loc-from to location ?loc-to. The goal is to reach a state where the following facts hold: p3 is at l0-1, p2 is at l1-0, p0 is at l0-0, and p1 is at l1-0."
question: "Simplify the plan [(fly-airplane a0 l0-0 l1-0), (fly-airplane a0 l1-0 l0-0), (load-truck p0 t1 l1-1), (drive-truck t1 l1-1 l1-0 c1), (unload-truck p0 t1 l1-0), (fly-airplane a0 l0-0 l1-0), (load-airplane p0 a0 l1-0), (load-airplane p3 a0 l1-0), (fly-airplane a0 l1-0 l0-0), (unload-airplane p0 a0 l0-0), (unload-airplane p3 a0 l0-0), (drive-truck t0 l0-1 l0-0 c0), (load-truck p3 t0 l0-0), (drive-truck t0 l0-0 l0-1 c0), (unload-truck p3 t0 l0-1)] by removing either a single action or a pair of consecutive actions, while still maintaining a valid plan. Provide the resulting simplified plan."
answer: "[(load-truck p0 t1 l1-1), (drive-truck t1 l1-1 l1-0 c1), (unload-truck p0 t1 l1-0), (fly-airplane a0 l0-0 l1-0), (load-airplane p0 a0 l1-0), (load-airplane p3 a0 l1-0), (fly-airplane a0 l1-0 l0-0), (unload-airplane p0 a0 l0-0), (unload-airplane p3 a0 l0-0), (drive-truck t0 l0-1 l0-0 c0), (load-truck p3 t0 l0-0), (drive-truck t0 l0-0 l0-1 c0), (unload-truck p3 t0 l0-1)]"
doc_to_text: "**Question**: {{context}} {{question}} **Final Answer**:"
filter_list:
- name: "acp_grammar_parse"
filter:
- function: "ACP_grammar_filter"
grammar_task: "action_list"
clean: "simplified plan"
- function: "take_first"
task: acp_land_gen
dataset_name: acp_land_gen
include: _gen_yaml_2shot
fewshot_config:
sampler: first_n
samples:
- context: "A robot is in a grid and can only move to places that are connected to its current position. The grid size is 5x5, and the locations are of the form fi-jf (e.g., f3-2f or f0-1f). The grid cells are connected to their neighbors (e.g., f1-2f is connected to the four neighbors f0-2f, f2-2f, f1-1f, and f1-3f). Some positions on the grid are locked and can be opened with a key of a matching shape. The robot has an arm that can pick up a key when the key is in same location as the robot and the arm is empty. There are 2 keys in 1 different shapes: Key key0-1 is of shape shape0, Key key0-0 is of shape shape0. Currently, the robot is at position f3-0f and its arm is empty. All the positions are open except the following: f4-2f has shape0 shaped lock. Key key0-0 is at position f3-0f. Key key0-1 is at position f1-3f. The goal is to reach a state where the following facts hold: Key key0-0 is at f2-0f location and Key key0-1 is at f1-3f location. The available propositions are: (at ?r ?x) - Key ?r is at ?x location, (at-robot ?x) - Robot is at ?x location, (locked ?x) - Location ?x is locked, (holding ?k) - Robot is holding ?k, (open ?x) - Location ?x is open, and (arm-empty) - Robot's arm is empty."
question: "Generate a non-trivial fact landmark, one that does not hold in the initial state or goal."
answer: "(holding key0-0)"
- context: "There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. There are 2 trucks and 1 airplane, as well as 4 packages. There are 4 locations across 2 cities. The locations are in cities as follows: l1-0 and l1-1 are in c1; l0-1 and l0-0 are in c0. Currently, a0 and p2 are at l1-0, t0 is at l0-0, t1 is at l1-1, p3 and p1 are in a0, p0 is in t1. The goal is to reach a state where the following facts hold: p0 is at l0-0, p2 is at l1-0, p1 is at l1-0, and p3 is at l0-1. The available propositions are: (at ?obj ?loc) - ?obj is at ?loc and (in ?obj1 ?obj2) - ?obj1 is in ?obj2."
question: "Generate a non-trivial fact landmark, one that does not hold in the initial state or goal."
answer: "(in p3 t0)"
doc_to_text: "**Question**: {{context}} {{question}} Provide only the ground proposition or None. **Final Answer**:"
filter_list:
- name: "acp_grammar_parse"
filter:
- function: "ACP_grammar_filter"
grammar_task: "act"
- function: "take_first"
task: acp_nexta_gen
dataset_name: acp_nexta_gen
include: _gen_yaml_2shot
fewshot_config:
sampler: first_n
samples:
- context: "A robot is in a grid and can only move to places that are connected to its current position. The grid size is 5x5, and the locations are of the form fi-jf (e.g., f3-2f or f0-1f). The grid cells are connected to their neighbors (e.g., f1-2f is connected to the four neighbors f0-2f, f2-2f, f1-1f, and f1-3f). Some positions on the grid are locked and can be opened with a key of a matching shape. The robot has an arm that can pick up a key when the key is in same location as the robot and the arm is empty. There are 2 keys in 1 different shapes: Key key0-1 is of shape shape0, Key key0-0 is of shape shape0. Currently, the robot is at position f4-0f and its arm is empty. All the positions are open except the following: f4-2f has shape0 shaped lock. Key key0-0 is at position f3-0f. Key key0-1 is at position f1-3f. The goal is to reach a state where the following facts hold: Key key0-0 is at f2-0f location and Key key0-1 is at f1-3f location. The available actions are: (unlock ?curpos ?lockpos ?key ?shape) - unlock place ?lockpos with key ?key of shape ?shape from current position place ?curpos, (move ?curpos ?nextpos) - travel from the current position ?curpos to the next position ?nextpos, (pickup ?curpos ?key) - pick up key ?key from place ?curpos, (pickup-and-loose ?curpos ?newkey ?oldkey) - pick up the key ?newkey at the current position place ?curpos and loose the key ?oldkey being held, and (putdown ?curpos ?key) - put down the key ?key at the current position ?curpos."
question: "What is the next action that takes us towards the goal?"
answer: "(move f4-0f f3-0f)"
- context: "There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. There are 2 trucks and 1 airplane, as well as 4 packages. There are 4 locations across 2 cities. The locations are in cities as follows: l0-1 and l0-0 are in c0; l1-1 and l1-0 are in c1. Currently, t0 is at l0-1, a0 is at l0-0, t1 and p1 are at l1-0, p2, p0, and p3 are in t1. The goal is to reach a state where the following facts hold: p3 is at l0-1, p2 is at l1-0, p1 is at l1-0, and p0 is at l0-0. The available actions are: (load-truck ?obj ?truck ?loc) - load object ?obj into truck ?truck at location ?loc, (load-airplane ?obj ?airplane ?loc) - load the object ?obj from location ?loc onto the airplane ?airplane, (unload-truck ?obj ?truck ?loc) - unload the object ?obj from the truck ?truck at location ?loc, (unload-airplane ?obj ?airplane ?loc) - unload object ?obj from airplane ?airplane at location ?loc, (drive-truck ?truck ?loc-from ?loc-to ?city) - drive the truck ?truck in city ?city from location ?loc-from to location ?loc-to, and (fly-airplane ?airplane ?loc-from ?loc-to) - operate the airplane ?airplane from airport ?loc-from to airport ?loc-to."
question: "What is the next action that takes us towards the goal?"
answer: "(drive-truck t0 l0-1 l0-0 c0)"
doc_to_text: "**Question**: {{context}} {{question}} Each action starts with an opening parenthesis and ends with closing parenthesis. Provide only the action. **Final Answer**:"
filter_list:
- name: "acp_grammar_parse"
filter:
- function: "ACP_grammar_filter"
grammar_task: "action_name"
- function: "take_first"
task: acp_prog_gen
dataset_name: acp_prog_gen
include: _gen_yaml_2shot
fewshot_config:
sampler: first_n
samples:
- context: "A robot is in a grid and can only move to places that are connected to its current position. \nThe grid size is 5x5, and the locations are of the form fi-jf (e.g., f3-2f or f0-1f). The grid cells are connected to their neighbors (e.g., f1-2f is connected to the four neighbors f0-2f, f2-2f, f1-1f, and f1-3f). Some positions on the grid are locked and can be opened with a key of a matching shape. The robot has an arm that can pick up a key when the key is in same location as the robot and the arm is empty. \nThere are 2 keys in 0 different shapes: Key key0-0 is of shape shape0, Key key0-1 is of shape shape0. \nCurrently, the robot is at position f0-1f and its arm is empty. All the positions are open except the following: f4-2f has shape0 shaped lock. Key key0-1 is at position f1-3f. Key key0-0 is at position f0-1f. The available propositions are: (at ?r ?x) - Key ?r is at ?x location, (at-robot ?x) - Robot is at ?x location, (locked ?x) - Location ?x is locked, (holding ?k) - Robot is holding ?k, (open ?x) - Location ?x is open, and (arm-empty) - Robot's arm is empty."
question: "Break down the outcomes of performing the action \"retrieve the key key0-0 from its current position f0-1f\" into two lists, positive effects and negative effects. Positive effects are the propositions that are false in the current state but will become true after performing the action. Negative effects are the propositions that are true in the current state and will become false after performing the action."
answer: "[(holding key0-0)] [(arm-empty), (at key0-0 f0-1f)]"
- context: "There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. There are 2 trucks and 1 airplane, as well as 4 packages. There are 4 locations across 2 cities. The locations are in cities as follows: l1-1 and l1-0 are in c1; l0-1 and l0-0 are in c0. Currently, p2, t1, p1, p3, a0, and p0 are at l1-0, t0 is at l0-1. The available propositions are: (at ?obj ?loc) - ?obj is at ?loc and (in ?obj1 ?obj2) - ?obj1 is in ?obj2."
question: "Break down the outcomes of performing the action \"load object p3 into truck t1 at location l1-0\" into two lists, positive effects and negative effects. Positive effects are the propositions that are false in the current state but will become true after performing the action. Negative effects are the propositions that are true in the current state and will become false after performing the action."
answer: "[(in p3 t1)] [(at p3 l1-0)]"
doc_to_text: "**Question**: {{context}} {{question}} Provide only the two lists with the ground propositions. **Final Answer**:"
filter_list:
- name: "acp_grammar_parse"
filter:
- function: "ACP_grammar_filter"
grammar_task: "progression_list"
clean: "pos_neg"
- function: "take_first"
task: acp_reach_gen
dataset_name: acp_reach_gen
include: _gen_yaml_2shot
fewshot_config:
sampler: first_n
samples:
- context: "A robot is in a grid and can only move to places that are connected to its current position. The grid size is 5x5, and the locations are of the form fi-jf (e.g., f3-2f or f0-1f). The grid cells are connected to their neighbors (e.g., f1-2f is connected to the four neighbors f0-2f, f2-2f, f1-1f, and f1-3f). Some positions on the grid are locked and can be opened with a key of a matching shape. The robot has an arm that can pick up a key when the key is in same location as the robot and the arm is empty. There are 2 keys in 0 different shapes: Key key0-1 is of shape shape0, Key key0-0 is of shape shape0. Currently, the robot is at position f1-2f and its arm is empty. All the positions are open except the following: f4-2f has shape0 shaped lock. Key key0-0 is at position f1-0f. Key key0-1 is at position f1-3f. The available propositions are: (at ?r ?x) - Key ?r is at ?x location, (at-robot ?x) - Robot is at ?x location, (locked ?x) - Location ?x is locked, (holding ?k) - Robot is holding ?k, (open ?x) - Location ?x is open, and (arm-empty) - Robot's arm is empty."
question: "What proposition can never hold in any potentially reachable state?"
answer: "(locked f3-1f)"
- context: "There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. There are 2 trucks and 1 airplane, as well as 4 packages. There are 4 locations across 2 cities. The locations are in cities as follows: l0-0 and l0-1 are in c0; l1-0 and l1-1 are in c1. Currently, a0, p2, and t1 are at l1-0, p3 and p0 are at l0-0, t0 is at l0-1, p1 is in t1. The available propositions are: (at ?obj ?loc) - ?obj is at ?loc and (in ?obj1 ?obj2) - ?obj1 is in ?obj2."
question: "What proposition can never hold in any potentially reachable state?"
answer: "(at t0 l1-1)"
doc_to_text: "**Question**: {{context}} {{question}} Provide one proposition or None. **Final Answer**:"
filter_list:
- name: "acp_grammar_parse"
filter:
- function: "ACP_grammar_filter"
grammar_task: "act"
- function: "take_first"
task: acp_val_gen
dataset_name: acp_val_gen
include: _gen_yaml_2shot
fewshot_config:
sampler: first_n
samples:
- context: "A robot is in a grid and can only move to places that are connected to its current position. The grid size is 5x5, and the locations are of the form fi-jf (e.g., f3-2f or f0-1f). The grid cells are connected to their neighbors (e.g., f1-2f is connected to the four neighbors f0-2f, f2-2f, f1-1f, and f1-3f). Some positions on the grid are locked and can be opened with a key of a matching shape. The robot has an arm that can pick up a key when the key is in same location as the robot and the arm is empty. There are 2 keys in 1 different shapes: Key key0-0 is of shape shape0, Key key0-1 is of shape shape0. Currently, the robot is at position f3-3f and its arm is empty. All the positions are open except the following: f2-0f has shape0 shaped lock, f4-2f has shape0 shaped lock. Key key0-1 is at position f1-3f. Key key0-0 is at position f2-2f. The goal is to reach a state where the following facts hold: Key key0-0 is at f2-0f location and Key key0-1 is at f1-3f location. The available actions are: (unlock ?curpos ?lockpos ?key ?shape) - unlock the place ?lockpos with the key ?key of the shape ?shape from the current position place ?curpos, (move ?curpos ?nextpos) - travel from the current position ?curpos to the next position ?nextpos, (pickup ?curpos ?key) - pick up key ?key from place ?curpos, (pickup-and-loose ?curpos ?newkey ?oldkey) - pick up the key ?newkey from the current position ?curpos and loose the key ?oldkey which is being held, and (putdown ?curpos ?key) - put down key ?key at current position place ?curpos."
question: "What is the first inapplicable action in the next sequence of actions: [(move f3-3f f3-2f), (move f3-2f f2-2f), (pickup f2-2f key0-0), (pickup-and-loose f4-0f key0-0 key0-1), (unlock f2-1f f2-0f key0-0 shape0), (move f2-1f f2-0f), (putdown f2-0f key0-0), (move f2-0f f2-1f)]?"
answer: "3"
- context: "There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. There are 2 trucks and 1 airplane, as well as 4 packages. There are 4 locations across 2 cities. The locations are in cities as follows: l0-1 and l0-0 are in c0; l1-1 and l1-0 are in c1. Currently, t1 and p0 are at l1-1, t0 is at l0-1, p3, p2, and p1 are at l1-0, a0 is at l0-0. The goal is to reach a state where the following facts hold: p2 is at l1-0, p3 is at l0-1, p0 is at l0-0, and p1 is at l1-0. The available actions are: (load-truck ?obj ?truck ?loc) - load object ?obj into truck ?truck at location ?loc, (load-airplane ?obj ?airplane ?loc) - load the object ?obj from location ?loc onto the airplane ?airplane, (unload-truck ?obj ?truck ?loc) - unload the object ?obj from the truck ?truck at location ?loc, (unload-airplane ?obj ?airplane ?loc) - unload object ?obj from airplane ?airplane at location ?loc, (drive-truck ?truck ?loc-from ?loc-to ?city) - navigate the truck ?truck from its current location ?loc-from in city ?city to the new location ?loc-to within the same city, and (fly-airplane ?airplane ?loc-from ?loc-to) - fly the airplane ?airplane from location ?loc-from to location ?loc-to."
question: "What is the first inapplicable action in the next sequence of actions: [(load-truck p0 t1 l1-1), (drive-truck t1 l1-1 l1-0 c1), (unload-truck p0 t1 l1-0), (fly-airplane a0 l0-0 l1-0), (unload-truck p3 t0 l0-1), (load-airplane p3 a0 l1-0), (fly-airplane a0 l1-0 l0-0), (unload-airplane p0 a0 l0-0), (unload-airplane p3 a0 l0-0), (drive-truck t0 l0-1 l0-0 c0), (load-truck p3 t0 l0-0), (drive-truck t0 l0-0 l0-1 c0), (unload-truck p3 t0 l0-1)]?"
answer: "4"
doc_to_text: "**Question**: {{context}} {{question}} Provide only the index of the action. **Final Answer**:"
filter_list:
- name: "acp_grammar_parse"
filter:
- function: "ACP_grammar_filter"
grammar_task: "index"
- function: "take_first"
tag:
- acp_gen_2shot_with_pddl
- acp_bench_hard_with_pddl
dataset_path: ibm-research/acp_bench
test_split: test
description: "Answer the question based on the provided PDDL domain and PDDL problem. The current state is the initial state described in the PDDL problem below.\n\n"
doc_to_target: "{{answer}}"
output_type: generate_until
num_fewshot: 2
generation_kwargs:
until:
- "\n\n\n\n"
- "\n\n"
- "**Question**:"
- "**Question:**"
- "Q:"
do_sample: false
max_gen_toks: 1000
temperature: 0.0
metadata:
version: 1.0
process_results: !function acp_utils.process_acp_results
metric_list:
- metric: "score"
aggregation: mean
higher_is_better: True
NAME: /[a-zA-Z][a-zA-Z0-9-_]*/
LPAR : "("
RPAR : ")"
LSPAR: "["
RSPAR: "]"
COMMA: ","
WS: /[ \n]/
action_none : "None"
action_name : LPAR NAME (WS NAME)* RPAR
action_list : (action_name WS?)*
prog_list : action_name* (COMMA action_name)*
progression_list : LSPAR prog_list RSPAR LSPAR prog_list RSPAR
act : action_name | action_none
index: /[0-9]+/
start: action_list
import json
import os
from abc import ABC, abstractmethod
from collections import defaultdict
from pathlib import Path
from lm_eval.api.registry import register_filter
from lm_eval.filters.extraction import RegexFilter
try:
import tempfile
import tarski
from kstar_planner import planners as kp
from lark import Lark
from lark.lexer import Token
from lark.visitors import Visitor
from pddl.core import Problem
from pddl.parser.domain import DomainParser
from pddl.parser.problem import ProblemParser
from tarski.grounding.common import StateVariableLite
from tarski.grounding.lp_grounding import LPGroundingStrategy
from tarski.io import PDDLReader
from tarski.io import fstrips as iofs
from tarski.syntax.formulas import is_atom
from tarski.syntax.transform.action_grounding import (
ground_schema_into_plain_operator_from_grounding,
)
from tarski.util import SymbolIndex
except ModuleNotFoundError:
raise ModuleNotFoundError(
"`lark>=1.1.9`, `tarski[clingo]==0.8.2`, `pddl==0.4.2` and `kstar-planner==1.4.2` "
"are required for evaluating the generative tasks. Please install via "
"pip install lm-eval[acpbench] or pip install -e .[acpbench]"
)
#########################################################################
# Grammar
GRAMMAR_FILE = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "acp_grammar.lark"
)
class ACPBench_Visitor(Visitor):
def __init__(self) -> None:
super().__init__()
self.action_lists = None
self.action_names = None
self.progression_lists = None
self.prog_lists = None
self.indexes = None
def action_list(self, tree):
self.action_lists = []
def prog_list(self, tree):
if self.prog_lists is not None:
self.progression_lists.append(self.prog_lists)
self.prog_lists = []
def progression_list(self, tree):
self.progression_lists = []
def action_none(self, tree):
self.action_names = "None"
def action_name(self, tree):
act_name = "(" + "".join(tree.children[1:-1]) + ")"
self.action_names = act_name
if self.action_lists is not None:
self.action_lists.append(act_name)
if self.prog_lists is not None:
self.prog_lists.append(act_name)
def index(self, tree):
self.indexes = "".join(tree.children)
if not self.indexes.isnumeric():
self.indexes = None
class ACPGrammarParser(object):
def __init__(self, task) -> None:
self.task = task
with open(GRAMMAR_FILE) as f:
grammar = f.read()
self.acp_parser = Lark(grammar, start=task, parser="lalr")
def parse(self, input, debug=False):
def ignore_errors(e):
if hasattr(e, "token") and e.token.type == "$END":
for x in e.expected:
if x != "WS":
e.interactive_parser.feed_token(
Token(x, self.acp_parser.get_terminal(x).pattern.value)
)
return True
input = input.replace("\n", "")
input = input.strip()
try:
tree = self.acp_parser.parse(input, on_error=ignore_errors)
if debug:
print(tree)
visitor = ACPBench_Visitor()
visitor.visit_topdown(tree)
if self.task == "action_list":
return visitor.action_lists
elif self.task == "act":
return visitor.action_names
elif self.task == "action_name":
return visitor.action_names
elif self.task == "index":
return visitor.indexes
elif self.task == "progression_list":
if visitor.prog_lists not in visitor.progression_lists:
visitor.progression_lists.append(visitor.prog_lists)
return visitor.progression_lists
except Exception as e:
if debug:
print("exception")
print(e)
return None
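# Editor's usage sketch (hypothetical, not from the original module): the
# parser normalizes raw model output into the shape each evaluator expects.
#
#   parser = ACPGrammarParser("action_list")
#   parser.parse("(move f3-3f f3-2f) (pickup f2-2f key0-0)")
#   # -> ["(move f3-3f f3-2f)", "(pickup f2-2f key0-0)"]
#
#   ACPGrammarParser("index").parse("3")  # -> "3"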
##############################################################################
# Utils
# Used in next action
def is_on_optimal_plan(domain, problem, action, opt):
with (
tempfile.NamedTemporaryFile() as domain_temp,
tempfile.NamedTemporaryFile() as problem_temp,
):
with open(str(domain_temp.name), "w", encoding="utf8") as file:
file.write(domain.lower())
with open(str(problem_temp.name), "w", encoding="utf8") as file:
file.write(problem.lower())
# Here, we need to keep the temp files live until the end of the function
try:
P = STRIPS(str(domain_temp.name), str(problem_temp.name))
except Exception:
# Unsolvable
return False
a = P.get_action_or_none(action[1:-1])
if a is None:
return False
state = P.init
next_state = progress(state, a)
if opt is None:
# Get an optimal plan cost
plans = generate_optimal_plans_for_problem_state(
P, state, num_plans=1, timeout=5
)
if plans is None:
# No optimal plan could be found within the timeout
return False
opt = len(plans[0]["actions"])
else:
opt = int(opt)
# Getting an optimal plan for the next state
next_plans = generate_optimal_plans_for_problem_state(
P, next_state, num_plans=1, timeout=5
)
if next_plans is None:
return False
next_opt = len(next_plans[0]["actions"])
return next_opt + 1 == opt
# Used in justification
def is_plan(domain, problem, new_plan):
P = get_STRIPS(domain, problem)
if P is None:
# Unsolvable
return False
# Check if new_plan is a plan
current_state = P.init
for action in new_plan:
applicable_actions = P.get_applicable_actions(current_state)
app_actions_list = [f"({a.name.lower()})" for a in applicable_actions]
if action.lower() not in app_actions_list:
return False
a = applicable_actions[app_actions_list.index(action.lower())]
current_state = progress(current_state, a)
return entails(current_state, P.goal)
# Used in action reachability
def get_action_preconditions(domain, problem, action):
P = get_STRIPS(domain, problem)
assert P is not None, f"Domain\n{domain}\nProblem\n{problem}\nAction: {action}"
a = P.get_action_or_none(action[1:-1])
if a is None:
return a
return [f"({f})" for f in a.pres]
def generate_optimal_plans_for_problem_state(P, state, num_plans, timeout):
with (
tempfile.NamedTemporaryFile() as domain_temp,
tempfile.NamedTemporaryFile() as problem_temp,
):
create_tmp_dom_prob_replace_init(P, state, domain_temp, problem_temp)
plans = generate_top_q_plans(
domain=str(domain_temp.name),
problem=str(problem_temp.name),
num_plans=num_plans,
quality_bound=1.0,
timeout=timeout,
)
# print(plans)
if plans is None or len(plans["plans"]) == 0:
return None
return plans["plans"]
def generate_top_q_plans(domain, problem, num_plans=10, quality_bound=1.0, timeout=30):
# print("Running K* planner")
plans = kp.plan_unordered_topq(
domain_file=Path(domain),
problem_file=Path(problem),
number_of_plans_bound=num_plans,
quality_bound=quality_bound,
timeout=timeout,
)
return plans
# Used in (action) reachability
def is_unsolvable_new_goal(domain, problem, new_goal):
goal = extract_goal(problem)
new_problem = problem.replace(goal, f"(:goal {new_goal} )")
return is_unsolvable(domain, new_problem)
def is_unsolvable(domain, problem):
with (
tempfile.NamedTemporaryFile() as domain_temp,
tempfile.NamedTemporaryFile() as problem_temp,
):
with open(str(domain_temp.name), "w", encoding="utf8") as file:
file.write(str(domain))
with open(str(problem_temp.name), "w", encoding="utf8") as file:
file.write(str(problem))
plans = kp.plan_unordered_topq(
domain_file=Path(str(domain_temp.name)),
problem_file=Path(str(problem_temp.name)),
quality_bound=1.0,
number_of_plans_bound=1,
timeout=3,
)
if plans is None:
return False
if len(plans["planner_error"]) > 0:
fl = plans["planner_error"].split("\n")[0]
print(f"Planner error: {fl}")
return False
if len(plans["plans"]) == 0:
return plans["unsolvable"]
return False
def extract_goal(prob):
a = prob.split("(:goal")[1]
cp = 1
for i, c in enumerate(a):
if c == ")":
cp -= 1
if c == "(":
cp += 1
if cp == 0:
return "(:goal" + a[: i + 1]
assert False
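# For example (editor's note):
#   extract_goal("(define (problem p) (:goal (and (at p0 l0-0))))")
#     -> "(:goal (and (at p0 l0-0)))"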
def entails(state, partialstate):
return partialstate <= state
def progress(state, act):
assert entails(state, act.pres), (
"Cannot progress with inconsistent state / action precondition:\n\t Action: "
+ act.name
+ "\n\t State: \n\t\t"
+ "\n\t\t".join(state)
)
return (state - act.dels) | act.adds
def regress(state, act):
assert len(state & act.dels) == 0, (
"Cannot regress with inconsistent state / action delete effect:\n\t Action: "
+ act.name
+ "\n\t State: \n\t\t"
+ "\n\t\t".join(state)
)
return (state - act.adds) | act.pres
def get_STRIPS(domain, problem):
with (
tempfile.NamedTemporaryFile() as domain_temp,
tempfile.NamedTemporaryFile() as problem_temp,
):
with open(str(domain_temp.name), "w", encoding="utf8") as file:
file.write(domain.lower())
with open(str(problem_temp.name), "w", encoding="utf8") as file:
file.write(problem.lower())
try:
P = STRIPS(str(domain_temp.name), str(problem_temp.name))
return P
except Exception as e:
print(f"||{e}||")
return None
def create_tmp_dom_prob_replace_init(P, state, result_domain_file, result_problem_file):
d, p = P.PDDL_replace_init_pddl_parser(state)
with open(str(result_domain_file.name), "w", encoding="utf8") as file:
file.write(str(d))
with open(str(result_problem_file.name), "w", encoding="utf8") as file:
file.write(str(p))
return d, p
def fix_name(s):
# (act param)
if "(" == s[0] and ")" == s[-1]:
return s[1:-1]
# make it space separated
s = s.replace(", ", " ").replace(",", " ")
# act(param)
if "(" in s:
assert ")" == s[-1], f"Broken name? {s}"
s = s.replace("(", " ").replace(")", "")
# act param
return s
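# For example (editor's note), each of these normalizes to "move f3-3f f3-2f":
#   fix_name("(move f3-3f f3-2f)")
#   fix_name("move(f3-3f, f3-2f)")
#   fix_name("move f3-3f f3-2f")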
def get_atoms_pddl(d, p, atoms):
objs = set()
preds = defaultdict(list)
for atom in atoms:
a = atom.lower().strip().split(" ")
args = a[1:]
preds[a[0]].append(args)
objs |= set(args)
constants = [o for o in p.objects | d.constants if o.name.lower() in objs]
constants_dict = {}
for c in constants:
constants_dict[c.name.lower()] = c
assert len(objs) == len(constants), (
f"Could not identify all objects: {objs - set(constants_dict.keys())} not found, {set(constants_dict.keys()) - objs} should not be there"
)
state = []
covered_preds = set()
for f in d.predicates:
name = f.name.lower()
if name in preds:
covered_preds.add(name)
assert len(preds[name][0]) == f.arity, (
f"The arity does not match: {preds[name]} vs {f.terms}"
)
# Going over the lists of objects, adding ground predicate for each
for ob in preds[name]:
c = [constants_dict[o] for o in ob]
state.append(f(*c))
assert len(covered_preds) == len(preds.keys()), (
f"Covered predicates: \n{sorted(list(covered_preds))} vs \n{sorted(list(preds.keys()))}"
)
return set(state)
class Action:
def __init__(self, name, pre, add, delete):
self.name = name
self.pres = pre
self.adds = add
self.dels = delete
def __str__(self):
pres = "{" + ", ".join([f"({a})" for a in self.pres]) + "}"
adds = "{" + ", ".join([f"({a})" for a in self.adds]) + "}"
dels = "{" + ", ".join([f"({a})" for a in self.dels]) + "}"
return f"< {self.name}, {pres}, {adds}, {dels} >"
def toJSON(self):
return json.dumps(
{
"name": self.name,
"preconditions": [f"({a})" for a in self.pres],
"add_effects": [f"({a})" for a in self.adds],
"delete_effects": [f"({a})" for a in self.dels],
},
sort_keys=True,
indent=4,
)
def __repr__(self):
return self.name
def __eq__(self, action):
return self.name == action.name
def __hash__(self):
return hash(self.name)
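# Editor's worked example (not part of the original module): progressing a
# simple state through a pickup-style action and regressing back again.
_example_action = Action(
"pickup f2-2f key0-0",
pre={"at-robot f2-2f", "at key0-0 f2-2f", "arm-empty"},
add={"holding key0-0"},
delete={"at key0-0 f2-2f", "arm-empty"},
)
_example_state = {"at-robot f2-2f", "at key0-0 f2-2f", "arm-empty"}
assert progress(_example_state, _example_action) == {"at-robot f2-2f", "holding key0-0"}
assert regress({"at-robot f2-2f", "holding key0-0"}, _example_action) == _example_state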
class STRIPS:
def __init__(self, domain, problem):
self.domain_file = domain
self.problem_file = problem
self.reader = PDDLReader(raise_on_error=True)
self.reader.parse_domain(domain)
self.problem = self.reader.parse_instance(problem)
(self.grounded_fluents, init, goal, self.operators, self.grounder) = (
self.ground_problem(self.problem)
)
self.fluents = set([fix_name(str(f)) for f in self.grounded_fluents])
self.fluents_map = dict()
for f in self.grounded_fluents:
self.fluents_map[fix_name(str(f))] = f
self.init = set([fix_name(str(f)) for f in init])
self.goal = set([fix_name(str(f)) for f in goal])
self.actions = set()
self.action_map = {}
self.init_fluents = [self.fluents_map[f] for f in self.init]
self.static_predicates = [i.name for i in self.grounder.static_symbols]
for op in self.operators:
act = self.operator_to_action(op)
self.actions.add(act)
self.action_map[act.name.lower()] = act
def __str__(self):
fluents = "P = {" + ", ".join([f"({a})" for a in self.fluents]) + "}"
init = "I = {" + ", ".join([f"({a})" for a in self.init]) + "}"
goal = "G = {" + ", ".join([f"({a})" for a in self.goal]) + "}"
actions = "A = {" + "\n ".join([a.__str__() for a in self.actions]) + "}"
return fluents + ",\n" + init + "\n" + goal + "\n" + actions
def toJSON(self):
actions = [a.toJSON() for a in self.actions]
return json.dumps(
{
"fluents": list(self.fluents),
"initial_state": list(self.init),
"goal": list(self.goal),
"actions": actions,
},
sort_keys=True,
indent=4,
)
def operator_to_action(self, op, check_fluents=True, check_static=False):
adds = {
fix_name(str(f.atom)) for f in op.effects if isinstance(f, iofs.AddEffect)
} & self.fluents
dels = {
fix_name(str(f.atom)) for f in op.effects if isinstance(f, iofs.DelEffect)
} & self.fluents
pre = self.fix_pre_name(op.precondition)
if check_fluents:
pre = pre & self.fluents
if check_static:
pre = {p for p in pre if p.split()[0] not in self.static_predicates}
act = Action(fix_name(str(op)), pre, adds, dels)
return act
def fix_pre_name(self, precondition):
if not is_atom(precondition):
return {fix_name(str(f)) for f in precondition.subformulas}
return {fix_name(str(precondition))}
def action(self, name):
return self.action_map[fix_name(name).lower()]
def get_action_or_none(self, name):
if "(" in name and ")" != name[-1]:
return None
return self.action_map.get(fix_name(name).lower(), None)
def fluent(self, name):
return fix_name(name)
def static_symbols(self):
return list(self.grounder.static_symbols)
def fluent_symbols(self):
return list(self.grounder.fluent_symbols)
def get_grounded_atoms(self, symbol):
variables = SymbolIndex()
lang = symbol.language
key = "atom_" + symbol.name
model = self.grounder._solve_lp()
# The key may be absent when no reachable ground state variable exists
# for that fluent symbol
if key in model:
for binding in model[key]:
binding_with_constants = tuple(lang.get(c) for c in binding)
variables.add(StateVariableLite(symbol, binding_with_constants))
return variables
def get_applicable_actions(self, s):
return [a for a in self.actions if entails(s, a.pres)]
def ground_problem(self, problem):
grounder = LPGroundingStrategy(problem, include_variable_inequalities=True)
action_groundings = grounder.ground_actions()
operators = []
for action_name, groundings in action_groundings.items():
action = problem.get_action(action_name)
for grounding in groundings:
operators.append(
ground_schema_into_plain_operator_from_grounding(action, grounding)
)
grounded_fluents = set(
[
grounded_fluent.to_atom()
for grounded_fluent in grounder.ground_state_variables().objects
]
)
init = [f for f in problem.init.as_atoms() if f in grounded_fluents]
if isinstance(problem.goal, tarski.syntax.Atom):
goal = [problem.goal]
else:
goal = [f for f in problem.goal.subformulas if f in grounded_fluents]
return (grounded_fluents, init, goal, operators, grounder)
def get_static(self):
static_symbols = self.static_symbols()
ret = []
for symbol in static_symbols:
ret.extend(self.get_grounded_atoms(symbol))
return set([fix_name(str(x)) for x in ret])
def PDDL_replace_init_pddl_parser(self, s):
with open(self.domain_file, "r") as f:
d = DomainParser()(f.read().lower())
with open(self.problem_file, "r") as f:
p = ProblemParser()(f.read().lower())
new_state = get_atoms_pddl(d, p, s | self.get_static())
new_p = Problem(
p.name, domain=d, objects=p.objects, init=new_state, goal=p.goal
)
return d, new_p
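# Editor's end-to-end sketch (file names are placeholders; requires the
# acpbench extras): ground a PDDL task and enumerate the actions that are
# applicable in its initial state.
#
#   P = STRIPS("domain.pddl", "problem.pddl")
#   for a in P.get_applicable_actions(P.init):
#       print(f"({a.name})")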
def parse_ans(response: str, parser: ACPGrammarParser, task: str):
return [parser.parse(clean_answer(resp, task)) for resp in response]
def remove_garbage(s):
while True:
if s.endswith("."):
s = s[:-1]
elif s.endswith("\n"):
s = s[:-1]
else:
break
return s.rstrip()
def compare_str(s1, s2):
return remove_garbage(s1).lower() == remove_garbage(s2).lower()
def compare(l1, l2):
if not isinstance(l1, list):
return compare_str(l1, l2)
if not isinstance(l2, list) or len(l1) != len(l2):
return False
for i, v in enumerate(l1):
if not compare(v, l2[i]):
return False
return True
def check_prog_response(resp):
if (
"Positive Effects".lower() in resp.lower()
and "Negative Effects".lower() in resp.lower()
):
if "[" not in resp:
return True
return False
def clean_answer(resp, task):
# Minor cleanup
if "progression_gen" in task:
# Check for Positive Effects and Negative Effects instead of separation
if check_prog_response(resp):
# replace **Positive Effects** with "["
# replace **Negative Effects** with "] ["
# append "]" to the end
resp2 = resp.lower()
resp2 = resp2.replace("*", "")
resp2 = resp2.replace("positive effects", "[")
resp2 = resp2.replace("negative effects", "] [")
resp2 = resp2 + "]"
return resp2
if "action_justification_gen" in task:
# Check for "simplified plan:"
if "simplified plan:" in resp.lower():
resp2 = resp.lower()
resp2 = resp2.replace("*", "")
resp2 = resp2.split("simplified plan:")[1]
return resp2
return resp
def get_grammar_task(task):
# print(task)
if task == "reachable_atom_gen":
return "act"
elif task == "progression_gen":
return "progression_list"
elif task == "validation_gen":
return "index"
elif task == "reachable_action_gen":
return "act"
elif task == "action_justification_gen":
return "action_list"
elif task == "landmarks_gen":
return "act"
elif task == "goal_closer_gen":
return "action_name"
elif task == "applicable_actions_gen":
return "action_list"
##############################################################################
# Evaluators
def fix_action_name(a):
assert a.startswith("(") and a.endswith(")")
return "(" + " ".join([x.strip() for x in a[1:-1].split(" ") if len(x) > 0]) + ")"
def str_remove_before_first_parentheses(s):
if s.startswith("("):
return s
try:
return s[s.index("(") :]
except Exception:
return ""
def str_remove_after_last_parentheses(s):
if s.endswith(")"):
return s
i = s.rfind(")")
if i == -1:
return ""
return s[: i + 1]
def cleanup_answer(ans):
if isinstance(ans, str):
ans = str_remove_before_first_parentheses(ans)
ans = str_remove_after_last_parentheses(ans)
ans = ans.lower()
ans = (
ans.replace(")\n(", ")######(")
.replace("),(", ")######(")
.replace(") (", ")######(")
.split("######")
)
return ans
if isinstance(ans, list):
res = []
for x in ans:
res.extend(cleanup_answer(x))
return res
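# For example (editor's sketch):
#   cleanup_answer("Answer: (move a b)\n(pickup b k).")
#     -> ["(move a b)", "(pickup b k)"]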
def set_equal(ans1, ans2):
return set(ans1) == set(ans2)
class BaseEvaluator(ABC):
def __init__(self) -> None:
self.scores = []
@abstractmethod
def get_score(self, ans, doc):
pass
def add_scores(self, scores):
self.scores.extend(scores)
def get_avg_score(self):
avg_score = sum(self.scores) / len(self.scores)
return avg_score
def get_evaluator(group):
if group == "applicable_actions_gen":
return ApplicabilityEvaluator()
elif group == "progression_gen":
return ProgressionEvaluator()
elif group == "validation_gen":
return ValidationEvaluator()
elif group == "reachable_atom_gen":
return ReachabilityEvaluator()
elif group == "goal_closer_gen":
return NextActionEvaluator()
elif group == "action_justification_gen":
return JustificationEvaluator()
elif group == "landmarks_gen":
return LandmarksEvaluator()
elif group == "reachable_action_gen":
return ActionReachabilityEvaluator()
raise ValueError(f"Group {group} not found")
"""
Action Reachability task: generate a valid action that is not applicable to any reachable state.
answer: A subset of actions that are known to be unreachable (not an exhaustive set).
It is empty only when we *know* that there are no such actions.
"""
class ActionReachabilityEvaluator(BaseEvaluator):
def get_score(self, ans, doc):
real_answer = doc["answer"]
if not real_answer or len(real_answer) == 0:
# The correct answer is None
self.add_scores(
["none" == x.strip().lower() if x is not None else False for x in ans]
)
else:
for x in ans:
if x is None:
self.scores.append(False)
continue
action = x.strip().lower()
if action in real_answer:
# The answer is in the subset of stored correct answers
self.scores.append(True)
continue
prec = get_action_preconditions(
doc["PDDL_domain"].lower(), doc["PDDL_problem"].lower(), action
)
if prec is None:
# The answer does not correspond to a valid action
self.scores.append(False)
else:
# Need to run a planner on a task with the answer action preconditions as the new goal
prec = f"(and {' '.join(prec)})"
self.scores.append(
is_unsolvable_new_goal(
doc["PDDL_domain"].lower(),
doc["PDDL_problem"].lower(),
prec,
)
)
return self.get_avg_score()
"""
Action Applicability task: generate all actions that are applicable in the current state.
answer: A set of all applicable actions.
"""
class ApplicabilityEvaluator(BaseEvaluator):
def get_score(self, ans, doc):
real_answer = doc["answer"]
real_answer = [a.lower() for a in real_answer]
ans = [[fix_action_name(a) for a in x] if x is not None else None for x in ans]
# Check if the answer is equal (as a set) to the real stored answer
self.add_scores(
[
set_equal(real_answer, cleanup_answer(x)) if x is not None else False
for x in ans
]
)
return self.get_avg_score()
def is_subsequence(plan, new_plan):
i = 0
for a in plan:
if a == new_plan[i]:
i += 1
if len(new_plan) == i:
# Done
return True
return False
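# For example (editor's note):
#   is_subsequence(["(a)", "(b)", "(c)"], ["(a)", "(c)"])  -> True
#   is_subsequence(["(a)", "(b)"], ["(b)", "(a)"])         -> False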
def is_subsequence_and_plan(domain, problem, plan, new_plan):
if len(plan) <= len(new_plan):
return False
if not is_subsequence(plan, new_plan):
return False
return is_plan(domain, problem, new_plan)
"""
Justification task: generate a proper subsequence of the given plan that is also a plan.
answer: A list of examples of actions that can be removed (ignored in evaluation).
"""
class JustificationEvaluator(BaseEvaluator):
def get_score(self, ans, doc):
# Sequence of actions (plan) from the question
if "inputs" in doc: # old field name
seq = doc["inputs"][19:-147]
else:
seq = doc["question"][19:-147]
seq = seq.replace(") (", ")######(").split("######")
for x in ans:
if x is None:
self.scores.append(False)
continue
# An answer plan candidate
x = [fix_action_name(a) for a in x]
if len(x) == 0:
# Wrong answer - never an empty sequence
self.scores.append(0)
continue
# Check if the plan candidate from the answer (a) is a proper subsequence of the plan in the question and (b) is a plan.
self.scores.append(
is_subsequence_and_plan(
doc["PDDL_domain"].lower(), doc["PDDL_problem"].lower(), seq, x
)
)
return self.get_avg_score()
"""
Landmarks task: generate a fact that is a non-trivial landmark for the current state.
answer: A list of facts that are found to be landmarks and a list of facts that are found to be non-landmarks.
The questions are generated only for cases where all facts either
(a) hold in the current state,
(b) true in goal,
(c) are found to be landmarks, or
(d) are found to be non-landmarks.
In such cases, the evaluation is simple, it does not require checking whether a fact is a landmark, it was
already done during question generation.
"""
class LandmarksEvaluator(BaseEvaluator):
def get_score(self, ans, doc):
# The set of facts that are found to be landmarks
real_answer = doc["answer"]
real_answer_yes = [a.lower() for a in real_answer["yes"]]
for x in ans:
if x is None:
self.scores.append(False)
continue
if x.strip().lower() in real_answer_yes:
# The answer fact is known to be landmark
self.scores.append(True)
elif x.strip().lower() == "none":
# The answer is none, correct only if there are no known landmarks,
# since we only generate questions when that means that there are no non-trivial landmarks
self.scores.append(len(real_answer_yes) == 0)
else:
# All other cases the answer is incorrect
self.scores.append(False)
return self.get_avg_score()
"""
Next Action task: generate an action that takes us closer to the goal.
answer:
(a) A list of applicable actions that are known to be correct answers
(b) A list of applicable actions that are known to be incorrect answers
(c) The rest of the applicable actions (maybe).
"""
class NextActionEvaluator(BaseEvaluator):
def get_score(self, ans, doc):
real_answer = doc["answer"]
real_answer_yes = [a.lower() for a in real_answer["yes"]]
real_answer_no = [a.lower() for a in real_answer["no"]]
real_answer_maybe = [a.lower() for a in real_answer["maybe"]]
# The cost of the optimal plan from the current state
opt = real_answer.get("opt", None)
for x in ans:
if x is None:
self.scores.append(False)
continue
action = x.strip().lower()
if action in real_answer_yes:
# Known to be correct
self.scores.append(True)
elif action in real_answer_no:
# Known to be incorrect
self.scores.append(False)
elif action not in real_answer_maybe:
# Not applicable, must be incorrect
self.scores.append(False)
else:
# Unknown, need to run a planner to check whether the state that results from applying the action is closer to the goal
# meaning has smaller optimal plan cost.
self.scores.append(
is_on_optimal_plan(
doc["PDDL_domain"].lower(),
doc["PDDL_problem"].lower(),
action,
opt,
)
)
return self.get_avg_score()
"""
Progression task: generate the positive and negative effects of an action in the current state.
answer:
(a) A list of facts that were false and become true, when the action is applied
(b) A list of facts that were true and become false, when the action is applied
"""
class ProgressionEvaluator(BaseEvaluator):
def get_score(self, ans, doc):
real_answer = doc["answer"]
real_answer_pos = [a.lower() for a in real_answer["pos"]]
real_answer_neg = [a.lower() for a in real_answer["neg"]]
for x in ans:
# The answer should be two lists. We allow for a single list and assume that the second one is empty (relaxed evaluation).
if x is None or len(x) > 2 or len(x) < 1:
self.scores.append(False)
else:
p = cleanup_answer(x[0])
if len(x) == 2:
n = cleanup_answer(x[1])
else:
# Assuming the last element is dropped because it is empty
n = []
# Check if the answer is equal as sets to the correct answers.
checks = [set_equal(real_answer_pos, p), set_equal(real_answer_neg, n)]
self.scores.append(all(checks))
return self.get_avg_score()
"""
Reachability task: generate a valid fact that will never become true in any reachable state.
answer: A subset of facts that are known to be unreachable (not an exhaustive set).
It is empty only when we *know* that there are no such facts.
"""
class ReachabilityEvaluator(BaseEvaluator):
def get_score(self, ans, doc):
real_answer = doc["answer"]
real_answer = [f"({x.strip().lower()})" for x in real_answer]
if len(real_answer) == 0:
# The correct answer is None
self.add_scores(
["none" == x.strip().lower() if x is not None else False for x in ans]
)
else:
for x in ans:
if x is None:
self.scores.append(False)
elif x.strip().lower() in real_answer:
# The answer is in the subset of stored correct answers
self.scores.append(True)
else:
# Need to run a planner on a task with the answer fact as the new goal
atom = x.strip().lower()
self.scores.append(
is_unsolvable_new_goal(
doc["PDDL_domain"].lower(),
doc["PDDL_problem"].lower(),
atom,
)
)
return self.get_avg_score()
"""
Validation task: generate an index of the first inapplicable action in the given sequence.
answer: the correct index.
"""
class ValidationEvaluator(BaseEvaluator):
def get_score(self, ans, doc):
real_answer = str(doc["answer"])
assert int(real_answer) >= 0, (
f"The index must be non-negative, received {real_answer}"
)
# Exact match
self.add_scores(
[
real_answer.lower() == x.strip().lower() if x is not None else False
for x in ans
]
)
return self.get_avg_score()
##############################################################################
def dump_item(item, **kwargs):
return json.dumps(item)
def parse_prediction(prediction):
try:
ans = json.loads(prediction.strip())
response = ans.get("answer", None)
return response
except Exception as e:
print(f"Exception occurred {e}")
return prediction
@register_filter("ACP_grammar_filter")
class ACPGrammarFilter(RegexFilter):
"""Filter model responses by parsing them with the ACP answer grammar."""
def __init__(self, *args, **kwargs):
self.parser = ACPGrammarParser(kwargs["grammar_task"])
self.clean = kwargs.get("clean")
def clean_pos_neg(self, resp):
# Check for Positive Effects and Negative Effects instead of separation
if check_prog_response(resp):
resp2 = resp.lower()
resp2 = resp2.replace("*", "")
resp2 = resp2.replace("positive effects", "[")
resp2 = resp2.replace("negative effects", "] [")
resp2 = resp2 + "]"
return resp2
return resp
def clean_simplified_plan(self, resp):
# Check for "simplified plan:"
if "simplified plan:" in resp.lower():
resp2 = resp.lower()
resp2 = resp2.replace("*", "")
resp2 = resp2.split("simplified plan:")[1]
return resp2
return resp
def apply(self, resps, docs):
if self.clean == "pos_neg":
filtered_resps = [
[self.parser.parse(self.clean_pos_neg(r)) for r in resp]
for resp in resps
]
elif self.clean == "simplified plan":
filtered_resps = [
[self.parser.parse(self.clean_simplified_plan(r)) for r in resp]
for resp in resps
]
else:
filtered_resps = [[self.parser.parse(r) for r in resp] for resp in resps]
return filtered_resps
def process_acp_results(doc, results):
return {"score": get_evaluator(doc["group"]).get_score(results, doc)}
def get_score(references, predictions, **kwargs):
# print(f"References: {references}")
# print(f"Predictions: {predictions}")
data = json.loads(references[0].strip())
real_ans = data["answer"]
task = data["group"]
responses = [parse_prediction(prediction) for prediction in predictions]
print(f"Real answer: {real_ans}")
print(f"Model answers: {responses}")
parser = ACPGrammarParser(get_grammar_task(task))
ans = parse_ans(responses, parser, task)
print(f"Parsed model answers: {ans}")
score = get_evaluator(task).get_score(ans, data)
return {"get_score": score}
task: acp_areach_gen_with_pddl
dataset_name: acp_areach_gen
include: _gen_yaml_2shot
fewshot_config:
sampler: first_n
samples:
- context: "A robot is in a grid and can only move to places that are connected to its current position. \nThe grid size is 5x5, and the locations are of the form fi-jf (e.g., f3-2f or f0-1f). The grid cells are connected to their neighbors (e.g., f1-2f is connected to the four neighbors f0-2f, f2-2f, f1-1f, and f1-3f). Some positions on the grid are locked and can be opened with a key of a matching shape. The robot has an arm that can pick up a key when the key is in same location as the robot and the arm is empty. \nThere are 2 keys in 0 different shapes: Key key0-1 is of shape shape0, Key key0-0 is of shape shape0. \nCurrently, the robot is at position f3-2f and its arm is empty. All the positions are open except the following: f2-0f has shape0 shaped lock, f4-2f has shape0 shaped lock. Key key0-1 is at position f1-3f. Key key0-0 is at position f2-2f. The available actions are: (unlock ?curpos ?lockpos ?key ?shape) - unlock the place ?lockpos with key ?key of shape ?shape from the current position place ?curpos, (move ?curpos ?nextpos) - move to place ?nextpos from place ?curpos, (pickup ?curpos ?key) - acquire the key ?key from the place ?curpos, (pickup-and-loose ?curpos ?newkey ?oldkey) - pick up key ?newkey at current position place ?curpos and loose key ?oldkey being held, and (putdown ?curpos ?key) - put down key ?key at current position place ?curpos."
question: "What action can never become applicable, in any state reachable from the current state?"
answer: "(pickup-and-loose f0-1f key0-0 key0-0)"
PDDL_domain: "(define (domain grid)\n (:requirements :strips :typing)\n (:types key place shape - object)\n (:predicates (arm-empty) (at ?r - key ?x - place) (at-robot ?x - place) (conn ?x - place ?y - place) (holding ?k - key) (key-shape ?k - key ?s - shape) (lock-shape ?x - place ?s - shape) (locked ?x - place) (open ?x - place))\n (:action move\n :parameters (?curpos - place ?nextpos - place)\n :precondition (and (at-robot ?curpos) (conn ?curpos ?nextpos) (open ?nextpos))\n :effect (and (at-robot ?nextpos) (not (at-robot ?curpos)))\n )\n (:action pickup\n :parameters (?curpos - place ?key - key)\n :precondition (and (at-robot ?curpos) (at ?key ?curpos) (arm-empty))\n :effect (and (holding ?key) (not (at ?key ?curpos)) (not (arm-empty)))\n )\n (:action pickup-and-loose\n :parameters (?curpos - place ?newkey - key ?oldkey - key)\n :precondition (and (at-robot ?curpos) (holding ?oldkey) (at ?newkey ?curpos))\n :effect (and (holding ?newkey) (at ?oldkey ?curpos) (not (holding ?oldkey)) (not (at ?newkey ?curpos)))\n )\n (:action putdown\n :parameters (?curpos - place ?key - key)\n :precondition (and (at-robot ?curpos) (holding ?key))\n :effect (and (arm-empty) (at ?key ?curpos) (not (holding ?key)))\n )\n (:action unlock\n :parameters (?curpos - place ?lockpos - place ?key - key ?shape - shape)\n :precondition (and (conn ?curpos ?lockpos) (key-shape ?key ?shape) (lock-shape ?lockpos ?shape) (at-robot ?curpos) (locked ?lockpos) (holding ?key))\n :effect (and (open ?lockpos) (not (locked ?lockpos)))\n )\n)"
PDDL_problem: "(define (problem grid-x5-y5-t1-k2-l2-p100)\n (:domain grid)\n (:requirements :strips :typing)\n (:objects key0-0 key0-1 - key f0-0f f0-1f f0-2f f0-3f f0-4f f1-0f f1-1f f1-2f f1-3f f1-4f f2-0f f2-1f f2-2f f2-3f f2-4f f3-0f f3-1f f3-2f f3-3f f3-4f f4-0f f4-1f f4-2f f4-3f f4-4f - place shape0 - shape)\n (:init (arm-empty) (at key0-0 f2-2f) (at key0-1 f1-3f) (at-robot f3-2f) (conn f0-0f f0-1f) (conn f0-0f f1-0f) (conn f0-1f f0-0f) (conn f0-1f f0-2f) (conn f0-1f f1-1f) (conn f0-2f f0-1f) (conn f0-2f f0-3f) (conn f0-2f f1-2f) (conn f0-3f f0-2f) (conn f0-3f f0-4f) (conn f0-3f f1-3f) (conn f0-4f f0-3f) (conn f0-4f f1-4f) (conn f1-0f f0-0f) (conn f1-0f f1-1f) (conn f1-0f f2-0f) (conn f1-1f f0-1f) (conn f1-1f f1-0f) (conn f1-1f f1-2f) (conn f1-1f f2-1f) (conn f1-2f f0-2f) (conn f1-2f f1-1f) (conn f1-2f f1-3f) (conn f1-2f f2-2f) (conn f1-3f f0-3f) (conn f1-3f f1-2f) (conn f1-3f f1-4f) (conn f1-3f f2-3f) (conn f1-4f f0-4f) (conn f1-4f f1-3f) (conn f1-4f f2-4f) (conn f2-0f f1-0f) (conn f2-0f f2-1f) (conn f2-0f f3-0f) (conn f2-1f f1-1f) (conn f2-1f f2-0f) (conn f2-1f f2-2f) (conn f2-1f f3-1f) (conn f2-2f f1-2f) (conn f2-2f f2-1f) (conn f2-2f f2-3f) (conn f2-2f f3-2f) (conn f2-3f f1-3f) (conn f2-3f f2-2f) (conn f2-3f f2-4f) (conn f2-3f f3-3f) (conn f2-4f f1-4f) (conn f2-4f f2-3f) (conn f2-4f f3-4f) (conn f3-0f f2-0f) (conn f3-0f f3-1f) (conn f3-0f f4-0f) (conn f3-1f f2-1f) (conn f3-1f f3-0f) (conn f3-1f f3-2f) (conn f3-1f f4-1f) (conn f3-2f f2-2f) (conn f3-2f f3-1f) (conn f3-2f f3-3f) (conn f3-2f f4-2f) (conn f3-3f f2-3f) (conn f3-3f f3-2f) (conn f3-3f f3-4f) (conn f3-3f f4-3f) (conn f3-4f f2-4f) (conn f3-4f f3-3f) (conn f3-4f f4-4f) (conn f4-0f f3-0f) (conn f4-0f f4-1f) (conn f4-1f f3-1f) (conn f4-1f f4-0f) (conn f4-1f f4-2f) (conn f4-2f f3-2f) (conn f4-2f f4-1f) (conn f4-2f f4-3f) (conn f4-3f f3-3f) (conn f4-3f f4-2f) (conn f4-3f f4-4f) (conn f4-4f f3-4f) (conn f4-4f f4-3f) (key-shape key0-0 shape0) (key-shape key0-1 shape0) (lock-shape f2-0f shape0) (lock-shape f4-2f shape0) (locked f2-0f) (locked f4-2f) (open f0-0f) (open f0-1f) (open f0-2f) (open f0-3f) (open f0-4f) (open f1-0f) (open f1-1f) (open f1-2f) (open f1-3f) (open f1-4f) (open f2-1f) (open f2-2f) (open f2-3f) (open f2-4f) (open f3-0f) (open f3-1f) (open f3-2f) (open f3-3f) (open f3-4f) (open f4-0f) (open f4-1f) (open f4-3f) (open f4-4f))\n (:goal (and (at key0-0 f2-0f) (at key0-1 f1-3f)))\n)"
- context: "There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. \nThere are 2 trucks and 1 airplane, as well as 4 packages. There are 4 locations across 2 cities. \nThe locations are in cities as follows: l1-0 and l1-1 are in c1; l0-0 and l0-1 are in c0. \nCurrently, a0, p1, and p2 are at l1-0, t0 is at l0-1, p3 and p0 are at l0-0, t1 is at l1-1. The available actions are: (load-truck ?obj ?truck ?loc) - place the object ?obj into the truck ?truck at location ?loc, (load-airplane ?obj ?airplane ?loc) - load the object ?obj from location ?loc into the airplane ?airplane, (unload-truck ?obj ?truck ?loc) - unload object ?obj from truck ?truck at location ?loc, (unload-airplane ?obj ?airplane ?loc) - unload object ?obj from airplane ?airplane at location ?loc, (drive-truck ?truck ?loc-from ?loc-to ?city) - navigate the truck ?truck from its current location ?loc-from in city ?city to the new location ?loc-to within the same city, and (fly-airplane ?airplane ?loc-from ?loc-to) - fly airplane ?airplane from airport ?loc-from to airport ?loc-to."
question: "What action can never become applicable, in any state reachable from the current state?"
answer: "(load-truck p2 t0 l1-1)"
PDDL_domain: "(define (domain logistics-strips)\n (:requirements :strips :typing) \n\n (:types \n location locatable city - object \n package movable - locatable\n airport - location\n airplane truck - movable \n )\t\t\n \n (:predicates \t\n\t\t(at ?obj - locatable ?loc - location)\n\t\t(in ?obj1 - package ?obj2 - movable)\n\t\t(in-city ?obj - location ?city - city))\n\n\n(:action LOAD-TRUCK\n :parameters\n (?obj - package\n ?truck - truck\n ?loc - location)\n :precondition\n (and \n (at ?truck ?loc) (at ?obj ?loc))\n :effect\n (and (not (at ?obj ?loc)) (in ?obj ?truck)))\n\n(:action LOAD-AIRPLANE\n :parameters\n (?obj - package\n ?airplane - airplane\n ?loc - location)\n :precondition\n (and \n (at ?obj ?loc) (at ?airplane ?loc))\n :effect\n (and (not (at ?obj ?loc)) (in ?obj ?airplane)))\n\n\n\n(:action UNLOAD-TRUCK\n :parameters\n (?obj - package\n ?truck - truck\n ?loc - location)\n :precondition\n (and \n (at ?truck ?loc) (in ?obj ?truck))\n :effect\n (and (not (in ?obj ?truck)) (at ?obj ?loc)))\n\n(:action UNLOAD-AIRPLANE\n :parameters\n (?obj - package\n ?airplane - airplane\n ?loc - location)\n :precondition\n (and \n (in ?obj ?airplane) (at ?airplane ?loc))\n :effect\n (and (not (in ?obj ?airplane)) (at ?obj ?loc)))\n\n(:action DRIVE-TRUCK\n :parameters\n (?truck - truck\n ?loc-from - location\n ?loc-to - location\n ?city - city)\n :precondition\n (and \n (at ?truck ?loc-from)\n (in-city ?loc-from ?city)\n (in-city ?loc-to ?city))\n :effect\n (and (not (at ?truck ?loc-from)) (at ?truck ?loc-to)))\n\n(:action FLY-AIRPLANE\n :parameters\n (?airplane - airplane\n ?loc-from - airport\n ?loc-to - airport)\n :precondition\n (and \n\t(at ?airplane ?loc-from))\n :effect\n (and (not (at ?airplane ?loc-from)) (at ?airplane ?loc-to)))\n)"
PDDL_problem: "(define (problem logistics-c2-s2-p4-a1)\n (:domain logistics-strips)\n (:requirements :strips :typing)\n (:objects a0 - airplane l0-0 l1-0 - airport c0 c1 - city l0-1 l1-1 - location p0 p1 p2 p3 - package t0 t1 - truck)\n (:init (at a0 l1-0) (at p0 l0-0) (at p1 l1-0) (at p2 l1-0) (at p3 l0-0) (at t0 l0-1) (at t1 l1-1) (in-city l0-0 c0) (in-city l0-1 c0) (in-city l1-0 c1) (in-city l1-1 c1))\n (:goal (and (at p0 l0-0) (at p1 l1-0) (at p2 l1-0) (at p3 l0-1)))\n)"
doc_to_text: "# PDDL DOMAIN \n\n```\n{{PDDL_domain}}\n```\n\n# PDDL PROBLEM \n\n```\n{{PDDL_problem}}\n```\n\n**Question**: {{context}} {{question}} Each action starts with an opening parenthesis and ends with closing parenthesis. Provide one action or None. **Final Answer**:"
filter_list:
- name: "acp_grammar_parse"
filter:
- function: "ACP_grammar_filter"
grammar_task: "act"
- function: "take_first"
task: acp_app_gen_with_pddl
dataset_name: acp_app_gen
include: _gen_yaml_2shot
fewshot_config:
sampler: first_n
samples:
- context: "A robot is in a grid and can only move to places that are connected to its current position. \nThe grid size is 5x5, and the locations are of the form fi-jf (e.g., f3-2f or f0-1f). The grid cells are connected to their neighbors (e.g., f1-2f is connected to the four neighbors f0-2f, f2-2f, f1-1f, and f1-3f). Some positions on the grid are locked and can be opened with a key of a matching shape. The robot has an arm that can pick up a key when the key is in same location as the robot and the arm is empty. \nThere are 2 keys in 0 different shapes: Key key0-0 is of shape shape0, Key key0-1 is of shape shape0. \nCurrently, the robot is at position f4-3f and its arm is empty. All the positions are open except the following: f2-0f has shape0 shaped lock, f4-2f has shape0 shaped lock. Key key0-0 is at position f3-1f. Key key0-1 is at position f1-3f. The available actions are: (unlock ?curpos ?lockpos ?key ?shape) - use the key ?key of shape ?shape to unlock the place ?lockpos from the current position ?curpos, (move ?curpos ?nextpos) - transition from the current position ?curpos to the next position ?nextpos, (pickup ?curpos ?key) - pick up key ?key from place ?curpos, (pickup-and-loose ?curpos ?newkey ?oldkey) - pick up the key ?newkey from the current position ?curpos and loose the key ?oldkey which is being held, and (putdown ?curpos ?key) - place the key ?key at the current position ?curpos."
question: "Generate the list of all ground actions that are applicable in this state."
answer: "[(move f4-3f f3-3f), (move f4-3f f4-4f)]"
PDDL_domain: "(define (domain grid)\n (:requirements :strips :typing)\n (:types key place shape - object)\n (:predicates (arm-empty) (at ?r - key ?x - place) (at-robot ?x - place) (conn ?x - place ?y - place) (holding ?k - key) (key-shape ?k - key ?s - shape) (lock-shape ?x - place ?s - shape) (locked ?x - place) (open ?x - place))\n (:action move\n :parameters (?curpos - place ?nextpos - place)\n :precondition (and (at-robot ?curpos) (conn ?curpos ?nextpos) (open ?nextpos))\n :effect (and (at-robot ?nextpos) (not (at-robot ?curpos)))\n )\n (:action pickup\n :parameters (?curpos - place ?key - key)\n :precondition (and (at-robot ?curpos) (at ?key ?curpos) (arm-empty))\n :effect (and (holding ?key) (not (at ?key ?curpos)) (not (arm-empty)))\n )\n (:action pickup-and-loose\n :parameters (?curpos - place ?newkey - key ?oldkey - key)\n :precondition (and (at-robot ?curpos) (holding ?oldkey) (at ?newkey ?curpos))\n :effect (and (holding ?newkey) (at ?oldkey ?curpos) (not (holding ?oldkey)) (not (at ?newkey ?curpos)))\n )\n (:action putdown\n :parameters (?curpos - place ?key - key)\n :precondition (and (at-robot ?curpos) (holding ?key))\n :effect (and (arm-empty) (at ?key ?curpos) (not (holding ?key)))\n )\n (:action unlock\n :parameters (?curpos - place ?lockpos - place ?key - key ?shape - shape)\n :precondition (and (conn ?curpos ?lockpos) (key-shape ?key ?shape) (lock-shape ?lockpos ?shape) (at-robot ?curpos) (locked ?lockpos) (holding ?key))\n :effect (and (open ?lockpos) (not (locked ?lockpos)))\n )\n)"
PDDL_problem: "(define (problem grid-x5-y5-t1-k2-l2-p100)\n (:domain grid)\n (:requirements :strips :typing)\n (:objects key0-0 key0-1 - key f0-0f f0-1f f0-2f f0-3f f0-4f f1-0f f1-1f f1-2f f1-3f f1-4f f2-0f f2-1f f2-2f f2-3f f2-4f f3-0f f3-1f f3-2f f3-3f f3-4f f4-0f f4-1f f4-2f f4-3f f4-4f - place shape0 - shape)\n (:init (arm-empty) (at key0-0 f3-1f) (at key0-1 f1-3f) (at-robot f4-3f) (conn f0-0f f0-1f) (conn f0-0f f1-0f) (conn f0-1f f0-0f) (conn f0-1f f0-2f) (conn f0-1f f1-1f) (conn f0-2f f0-1f) (conn f0-2f f0-3f) (conn f0-2f f1-2f) (conn f0-3f f0-2f) (conn f0-3f f0-4f) (conn f0-3f f1-3f) (conn f0-4f f0-3f) (conn f0-4f f1-4f) (conn f1-0f f0-0f) (conn f1-0f f1-1f) (conn f1-0f f2-0f) (conn f1-1f f0-1f) (conn f1-1f f1-0f) (conn f1-1f f1-2f) (conn f1-1f f2-1f) (conn f1-2f f0-2f) (conn f1-2f f1-1f) (conn f1-2f f1-3f) (conn f1-2f f2-2f) (conn f1-3f f0-3f) (conn f1-3f f1-2f) (conn f1-3f f1-4f) (conn f1-3f f2-3f) (conn f1-4f f0-4f) (conn f1-4f f1-3f) (conn f1-4f f2-4f) (conn f2-0f f1-0f) (conn f2-0f f2-1f) (conn f2-0f f3-0f) (conn f2-1f f1-1f) (conn f2-1f f2-0f) (conn f2-1f f2-2f) (conn f2-1f f3-1f) (conn f2-2f f1-2f) (conn f2-2f f2-1f) (conn f2-2f f2-3f) (conn f2-2f f3-2f) (conn f2-3f f1-3f) (conn f2-3f f2-2f) (conn f2-3f f2-4f) (conn f2-3f f3-3f) (conn f2-4f f1-4f) (conn f2-4f f2-3f) (conn f2-4f f3-4f) (conn f3-0f f2-0f) (conn f3-0f f3-1f) (conn f3-0f f4-0f) (conn f3-1f f2-1f) (conn f3-1f f3-0f) (conn f3-1f f3-2f) (conn f3-1f f4-1f) (conn f3-2f f2-2f) (conn f3-2f f3-1f) (conn f3-2f f3-3f) (conn f3-2f f4-2f) (conn f3-3f f2-3f) (conn f3-3f f3-2f) (conn f3-3f f3-4f) (conn f3-3f f4-3f) (conn f3-4f f2-4f) (conn f3-4f f3-3f) (conn f3-4f f4-4f) (conn f4-0f f3-0f) (conn f4-0f f4-1f) (conn f4-1f f3-1f) (conn f4-1f f4-0f) (conn f4-1f f4-2f) (conn f4-2f f3-2f) (conn f4-2f f4-1f) (conn f4-2f f4-3f) (conn f4-3f f3-3f) (conn f4-3f f4-2f) (conn f4-3f f4-4f) (conn f4-4f f3-4f) (conn f4-4f f4-3f) (key-shape key0-0 shape0) (key-shape key0-1 shape0) (lock-shape f2-0f shape0) (lock-shape f4-2f shape0) (locked f2-0f) (locked f4-2f) (open f0-0f) (open f0-1f) (open f0-2f) (open f0-3f) (open f0-4f) (open f1-0f) (open f1-1f) (open f1-2f) (open f1-3f) (open f1-4f) (open f2-1f) (open f2-2f) (open f2-3f) (open f2-4f) (open f3-0f) (open f3-1f) (open f3-2f) (open f3-3f) (open f3-4f) (open f4-0f) (open f4-1f) (open f4-3f) (open f4-4f))\n (:goal (and (at key0-0 f2-0f) (at key0-1 f1-3f)))\n)"
- context: "There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. \nThere are 2 trucks and 1 airplane, as well as 4 packages. There are 4 locations across 2 cities. \nThe locations are in cities as follows: l0-1 and l0-0 are in c0; l1-1 and l1-0 are in c1. \nCurrently, t1 is at l1-0, p0, a0, t0, and p3 are at l0-0, p1 and p2 are in t1. The available actions are: (load-truck ?obj ?truck ?loc) - load the object ?obj from location ?loc into the truck ?truck, (load-airplane ?obj ?airplane ?loc) - place the object ?obj onto the airplane ?airplane at location ?loc, (unload-truck ?obj ?truck ?loc) - remove the object ?obj from the truck ?truck and place it on the location ?loc, (unload-airplane ?obj ?airplane ?loc) - unload object ?obj from airplane ?airplane at location ?loc, (drive-truck ?truck ?loc-from ?loc-to ?city) - navigate the truck ?truck from location ?loc-from in city ?city to location ?loc-to in the same city, and (fly-airplane ?airplane ?loc-from ?loc-to) - fly airplane ?airplane from airport ?loc-from to airport ?loc-to."
question: "Generate the list of all ground actions that are applicable in this state."
answer: "[(unload-truck p2 t1 l1-0), (drive-truck t0 l0-0 l0-0 c0), (load-airplane p0 a0 l0-0), (load-truck p0 t0 l0-0), (unload-truck p1 t1 l1-0), (drive-truck t1 l1-0 l1-0 c1), (drive-truck t0 l0-0 l0-1 c0), (drive-truck t1 l1-0 l1-1 c1), (fly-airplane a0 l0-0 l0-0), (load-truck p3 t0 l0-0), (fly-airplane a0 l0-0 l1-0), (load-airplane p3 a0 l0-0)]"
PDDL_domain: "(define (domain logistics-strips)\n (:requirements :strips :typing) \n\n (:types \n location locatable city - object \n package movable - locatable\n airport - location\n airplane truck - movable \n )\t\t\n \n (:predicates \t\n\t\t(at ?obj - locatable ?loc - location)\n\t\t(in ?obj1 - package ?obj2 - movable)\n\t\t(in-city ?obj - location ?city - city))\n\n\n(:action LOAD-TRUCK\n :parameters\n (?obj - package\n ?truck - truck\n ?loc - location)\n :precondition\n (and \n (at ?truck ?loc) (at ?obj ?loc))\n :effect\n (and (not (at ?obj ?loc)) (in ?obj ?truck)))\n\n(:action LOAD-AIRPLANE\n :parameters\n (?obj - package\n ?airplane - airplane\n ?loc - location)\n :precondition\n (and \n (at ?obj ?loc) (at ?airplane ?loc))\n :effect\n (and (not (at ?obj ?loc)) (in ?obj ?airplane)))\n\n\n\n(:action UNLOAD-TRUCK\n :parameters\n (?obj - package\n ?truck - truck\n ?loc - location)\n :precondition\n (and \n (at ?truck ?loc) (in ?obj ?truck))\n :effect\n (and (not (in ?obj ?truck)) (at ?obj ?loc)))\n\n(:action UNLOAD-AIRPLANE\n :parameters\n (?obj - package\n ?airplane - airplane\n ?loc - location)\n :precondition\n (and \n (in ?obj ?airplane) (at ?airplane ?loc))\n :effect\n (and (not (in ?obj ?airplane)) (at ?obj ?loc)))\n\n(:action DRIVE-TRUCK\n :parameters\n (?truck - truck\n ?loc-from - location\n ?loc-to - location\n ?city - city)\n :precondition\n (and \n (at ?truck ?loc-from)\n (in-city ?loc-from ?city)\n (in-city ?loc-to ?city))\n :effect\n (and (not (at ?truck ?loc-from)) (at ?truck ?loc-to)))\n\n(:action FLY-AIRPLANE\n :parameters\n (?airplane - airplane\n ?loc-from - airport\n ?loc-to - airport)\n :precondition\n (and \n\t(at ?airplane ?loc-from))\n :effect\n (and (not (at ?airplane ?loc-from)) (at ?airplane ?loc-to)))\n)"
PDDL_problem: "(define (problem logistics-c2-s2-p4-a1)\n (:domain logistics-strips)\n (:requirements :strips :typing)\n (:objects a0 - airplane l0-0 l1-0 - airport c0 c1 - city l0-1 l1-1 - location p0 p1 p2 p3 - package t0 t1 - truck)\n (:init (at a0 l0-0) (at p0 l0-0) (at p3 l0-0) (at t0 l0-0) (at t1 l1-0) (in p1 t1) (in p2 t1) (in-city l0-0 c0) (in-city l0-1 c0) (in-city l1-0 c1) (in-city l1-1 c1))\n (:goal (and (at p0 l0-0) (at p1 l1-0) (at p2 l1-0) (at p3 l0-1)))\n)"
doc_to_text: "# PDDL DOMAIN \n\n```\n{{PDDL_domain}}\n```\n\n# PDDL PROBLEM \n\n```\n{{PDDL_problem}}\n```\n\n**Question**: {{context}} {{question}} Each action starts with an opening parenthesis and ends with closing parenthesis. Provide only the actions. \n**Final Answer**:"
filter_list:
- name: "acp_grammar_parse"
filter:
- function: "ACP_grammar_filter"
grammar_task: "action_list"
- function: "take_first"