"configs/wan/wan_t2v_dist.json" did not exist on "ae96fdbf2b7c527ccd2f05f94b54b89e93fd2437"
Commit 8fada609 authored by Baber

Merge branch 'main' into mathvista

parents 0007b74a 1208afd3
@@ -76,9 +76,9 @@ class VLLM(TemplateLM):
         )
         assert "cuda" in device or device is None, "vLLM only supports CUDA"
-        assert (
-            max_length is None or max_model_len is None
-        ), "Either max_length or max_model_len may be provided, but not both"
+        assert max_length is None or max_model_len is None, (
+            "Either max_length or max_model_len may be provided, but not both"
+        )
         self._max_length = max_model_len if max_model_len is not None else max_length
         self.tensor_parallel_size = int(tensor_parallel_size)
@@ -142,9 +142,9 @@ class VLLM(TemplateLM):
         self._max_gen_toks = max_gen_toks
         if lora_local_path is not None:
-            assert parse_version(version("vllm")) > parse_version(
-                "0.3.0"
-            ), "lora adapters only compatible with vllm > v0.3.0."
+            assert parse_version(version("vllm")) > parse_version("0.3.0"), (
+                "lora adapters only compatible with vllm > v0.3.0."
+            )
             self.lora_request = LoRARequest("finetuned", 1, lora_local_path)
         else:
             self.lora_request = None
...
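The two hunks above only restyle the assertion messages; the constructor's contract is unchanged: `max_length` and `max_model_len` stay mutually exclusive, and whichever is supplied becomes the effective context window. A minimal standalone sketch of that logic (illustrative only, not the repository's code):

```python
# Illustrative restatement of the constructor check reformatted above.
def resolve_max_length(max_length=None, max_model_len=None):
    # The two knobs describe the same limit, so at most one may be set.
    assert max_length is None or max_model_len is None, (
        "Either max_length or max_model_len may be provided, but not both"
    )
    return max_model_len if max_model_len is not None else max_length
```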
@@ -271,7 +271,9 @@ class VLLM_VLM(VLLM):
                 left_truncate_len=max_ctx_len,
             )
-            cont = self._model_generate(inputs, stop=until, generate=True, **kwargs)
+            cont = self._model_generate(
+                inputs, stop=until, generate=True, max_tokens=max_gen_toks, **kwargs
+            )
             for output, context in zip(cont, contexts):
                 generated_text = output.outputs[0].text
...
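The `VLLM_VLM` hunk additionally threads the harness's generation cap into the model call. A hedged sketch of why that matters with vLLM's API (the helper below is hypothetical, not the repository's `_model_generate`): `SamplingParams` defaults `max_tokens` to a small value, so a cap that is not forwarded explicitly is effectively ignored.

```python
# Hypothetical helper; only SamplingParams and its stop/max_tokens arguments
# are real vLLM API, the surrounding function is illustrative.
from vllm import SamplingParams


def make_sampling_params(stop, max_gen_toks=256, **kwargs):
    # Forward the harness-level cap so vLLM does not fall back to its default.
    return SamplingParams(stop=stop, max_tokens=max_gen_toks, **kwargs)
```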
@@ -6,7 +6,7 @@
 For more information, including a full list of task names and their precise meanings or sources, follow the links provided to the individual README.md files for each subfolder.

 | Task Family | Description | Language(s) |
-|-------------|-------------|-------------|
+|--------------------------------------------------------------------------|-------------|-------------------------------------------------------------------------------------------------------------------------------|
 | [aclue](aclue/README.md) | Tasks focusing on ancient Chinese language understanding and cultural aspects. | Ancient Chinese |
 | [aexams](aexams/README.md) | Tasks in Arabic related to various academic exams covering a range of subjects. | Arabic |
 | [agieval](agieval/README.md) | Tasks involving historical data or questions related to history and historical texts. | English, Chinese |
@@ -44,7 +44,7 @@
 | [eus_trivia](eus_trivia/README.md) | Trivia and knowledge testing tasks in the Basque language. | Basque |
 | [fda](fda/README.md) | Tasks for extracting key-value pairs from FDA documents to test information extraction. | English |
 | [fld](fld/README.md) | Tasks involving free-form and directed dialogue understanding. | English |
-| [french_bench](french_bench/README.md) | Set of tasks designed to assess language model performance in French. | French|
+| [french_bench](french_bench/README.md) | Set of tasks designed to assess language model performance in French. | French |
 | [galician_bench](galician_bench/README.md) | Collection of tasks in Galician encompassing various evaluation areas. | Galician |
 | [global_mmlu](global_mmlu/README.md) | Collection of culturally sensitive and culturally agnostic MMLU tasks in 15 languages with human translations or post-edits. | Multiple (15 languages) |
 | [glue](glue/README.md) | General Language Understanding Evaluation benchmark to test broad language abilities. | English |
@@ -55,6 +55,8 @@
 | [hellaswag](hellaswag/README.md) | Tasks to predict the ending of stories or scenarios, testing comprehension and creativity. | English |
 | [hendrycks_ethics](hendrycks_ethics/README.md) | Tasks designed to evaluate the ethical reasoning capabilities of models. | English |
 | [hendrycks_math](hendrycks_math/README.md) | Mathematical problem-solving tasks to test numerical reasoning and problem-solving. | English |
+| [histoires_morales](histoires_morales/README.md) | A dataset of structured narratives that describe normative and norm-divergent actions taken by individuals to accomplish certain intentions in concrete situations. | French (Some MT) |
+| [hrm8k](hrm8k/README.md) | A challenging bilingual math reasoning benchmark for Korean and English. | Korean (Some MT), English (Some MT) |
 | [humaneval](humaneval/README.md) | Code generation task that measure functional correctness for synthesizing programs from docstrings. | Python |
 | [ifeval](ifeval/README.md) | Interactive fiction evaluation tasks for narrative understanding and reasoning. | English |
 | [inverse_scaling](inverse_scaling/README.md) | Multiple-choice tasks from the Inverse Scaling Prize, designed to find settings where larger language models perform worse. | English |
@@ -85,6 +87,7 @@
 | [mmlu_pro](mmlu_pro/README.md) | A refined set of MMLU, integrating more challenging, reasoning-focused questions and expanding the choice set from four to ten options. | English |
 | [mmlusr](mmlusr/README.md) | Variation of MMLU designed to be more rigorous. | English |
 | model_written_evals | Evaluation tasks auto-generated for evaluating a collection of AI Safety concerns. | |
+| [moral_stories](moral_stories/README.md) | A crowd-sourced dataset of structured narratives that describe normative and norm-divergent actions taken by individuals to accomplish certain intentions in concrete situations. | English |
 | [mutual](mutual/README.md) | A retrieval-based dataset for multi-turn dialogue reasoning. | English |
 | [nq_open](nq_open/README.md) | Open domain question answering tasks based on the Natural Questions dataset. | English |
 | [okapi/arc_multilingual](okapi/arc_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) **Machine Translated.** |
...
@@ -9,4 +9,4 @@ aggregate_metric_list:
   - metric: acc
     weight_by_size: True
 metadata:
-  version: 0
+  version: 1
@@ -6,4 +6,4 @@ aggregate_metric_list:
   - metric: acc
     weight_by_size: True
 metadata:
-  version: 0
+  version: 1
@@ -6,4 +6,4 @@ aggregate_metric_list:
   - metric: acc
     weight_by_size: True
 metadata:
-  version: 0
+  version: 1
@@ -6,4 +6,4 @@ aggregate_metric_list:
   - metric: acc
     weight_by_size: True
 metadata:
-  version: 0
+  version: 1
@@ -6,4 +6,4 @@ aggregate_metric_list:
   - metric: acc
     weight_by_size: True
 metadata:
-  version: 0
+  version: 1
@@ -6,4 +6,4 @@ aggregate_metric_list:
   - metric: acc
     weight_by_size: True
 metadata:
-  version: 0
+  version: 1
-dataset_path: yazeed7/ArabicMMLU
+dataset_path: MBZUAI/ArabicMMLU
 test_split: test
 fewshot_split: dev
 fewshot_config:
@@ -12,4 +12,4 @@ metric_list:
     aggregation: mean
     higher_is_better: true
 metadata:
-  version: 0.0
+  version: 1.0
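The template now points at the MBZUAI organization on the Hugging Face Hub, with one config per subject (the `dataset_name` values in the regenerated task files). A hedged sanity check, assuming network access and that the config name matches one of the `dataset_name` fields shown below:

```python
# Assumption: "Accounting (University)" is a valid config of MBZUAI/ArabicMMLU,
# matching the dataset_name used by the regenerated task YAMLs.
from datasets import load_dataset

ds = load_dataset("MBZUAI/ArabicMMLU", "Accounting (University)", split="test")
print(len(ds), ds.column_names)
```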
@@ -14,46 +14,46 @@ eval_logger = logging.getLogger("lm-eval")
 SUBJECTS = {
-    "Driving Test": "other",
-    "High Geography": "social_science",
-    "High History": "humanities",
     "Islamic Studies": "humanities",
-    "Univ Accounting": "social_science",
-    "Primary General Knowledge": "other",
-    "Univ Political Science": "social_science",
-    "Primary Math": "stem",
-    "Middle General Knowledge": "other",
-    "High Biology": "stem",
-    "Primary Natural Science": "stem",
-    "High Economics": "social_science",
-    "Middle Natural Science": "stem",
-    "Middle Geography": "social_science",
-    "Primary Social Science": "social_science",
-    "Middle Computer Science": "stem",
-    "Middle Islamic Studies": "humanities",
-    "Primary Computer Science": "stem",
-    "High Physics": "stem",
-    "Middle Social Science": "social_science",
-    "Middle Civics": "social_science",
-    "High Computer Science": "stem",
+    "Driving Test": "other",
+    "Natural Science (Middle School)": "stem",
+    "Natural Science (Primary School)": "stem",
+    "History (Primary School)": "humanities",
+    "History (Middle School)": "humanities",
+    "History (High School)": "humanities",
     "General Knowledge": "other",
-    "High Civics": "social_science",
-    "Prof Law": "humanities",
-    "High Islamic Studies": "humanities",
-    "Primary Arabic Language": "language",
-    "High Arabic Language": "language",
-    "Arabic Language (Grammar)": "language",
-    "Primary History": "humanities",
-    "Middle History": "humanities",
-    "Univ Economics": "social_science",
+    "General Knowledge (Primary School)": "other",
+    "General Knowledge (Middle School)": "other",
+    "Law (Professional)": "humanities",
+    "Physics (High School)": "stem",
+    "Social Science (Middle School)": "social_science",
+    "Social Science (Primary School)": "social_science",
+    "Management (University)": "other",
+    "Arabic Language (Primary School)": "language",
+    "Arabic Language (Middle School)": "language",
+    "Arabic Language (High School)": "language",
+    "Political Science (University)": "social_science",
+    "Philosophy (High School)": "humanities",
+    "Accounting (University)": "social_science",
+    "Computer Science (University)": "stem",
+    "Computer Science (Middle School)": "stem",
+    "Computer Science (Primary School)": "stem",
+    "Computer Science (High School)": "stem",
+    "Geography (Primary School)": "social_science",
+    "Geography (Middle School)": "social_science",
+    "Geography (High School)": "social_science",
+    "Math (Primary School)": "stem",
+    "Biology (High School)": "stem",
+    "Economics (University)": "social_science",
+    "Economics (Middle School)": "social_science",
+    "Economics (High School)": "social_science",
     "Arabic Language (General)": "language",
-    "Univ Computer Science": "stem",
-    "Primary Islamic Studies": "humanities",
-    "Primary Geography": "social_science",
-    "High Philosophy": "humanities",
-    "Middle Arabic Language": "language",
-    "Middle Economics": "social_science",
-    "Univ Management": "other",
+    "Arabic Language (Grammar)": "language",
+    "Islamic Studies (High School)": "humanities",
+    "Islamic Studies (Middle School)": "humanities",
+    "Islamic Studies (Primary School)": "humanities",
+    "Civics (Middle School)": "social_science",
+    "Civics (High School)": "social_science",
 }
@@ -69,8 +69,9 @@ if __name__ == "__main__":
     # get filename of base_yaml so we can `"include": ` it in our "other" YAMLs.
     base_yaml_name = os.path.split(args.base_yaml_path)[-1]
-    with open(args.base_yaml_path, encoding="utf-8") as f:
-        base_yaml = yaml.full_load(f)
+    # with open(args.base_yaml_path, encoding="utf-8") as f:
+    #     base_yaml = yaml.full_load(f)

     ALL_CATEGORIES = []
     for subject, category in tqdm(SUBJECTS.items()):
@@ -81,8 +82,8 @@ if __name__ == "__main__":
         yaml_dict = {
             "include": base_yaml_name,
-            "tag": f"arabicmmlu_{category}",
-            "task": f"arabicmmlu_{subject.lower().replace(' ', '_')}",
+            "tag": f"arabicmmlu_{category}_tasks",
+            "task": f"arabicmmlu_{subject.lower().replace(' ', '_').replace('(', '').replace(')', '')}",
             "task_alias": subject,
             "dataset_name": subject,
             # "description": description,
...
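The new `task` expression also strips parentheses from the renamed subjects, which is what produces names like `arabicmmlu_arabic_language_high_school` in the regenerated YAML files below. A quick self-contained check of that derivation (illustrative, not part of the repository):

```python
# Reproduces the f-string from the hunk above as a standalone function.
def task_name(subject: str) -> str:
    return f"arabicmmlu_{subject.lower().replace(' ', '_').replace('(', '').replace(')', '')}"


assert task_name("Arabic Language (High School)") == "arabicmmlu_arabic_language_high_school"
assert task_name("Accounting (University)") == "arabicmmlu_accounting_university"
```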
"dataset_name": "Middle Civics" "dataset_name": "Accounting (University)"
"tag": "arabicmmlu_social_science_tasks"
"include": "_default_arabicmmlu_template_yaml" "include": "_default_arabicmmlu_template_yaml"
"task": "arabicmmlu_middle_civics" "tag": "arabicmmlu_social_science_tasks"
"task_alias": "Middle Civics" "task": "arabicmmlu_accounting_university"
"task_alias": "Accounting (University)"
"dataset_name": "Arabic Language (General)" "dataset_name": "Arabic Language (General)"
"tag": "arabicmmlu_language_tasks"
"include": "_default_arabicmmlu_template_yaml" "include": "_default_arabicmmlu_template_yaml"
"task": "arabicmmlu_arabic_language_(general)" "tag": "arabicmmlu_language_tasks"
"task": "arabicmmlu_arabic_language_general"
"task_alias": "Arabic Language (General)" "task_alias": "Arabic Language (General)"
"dataset_name": "Arabic Language (Grammar)" "dataset_name": "Arabic Language (Grammar)"
"tag": "arabicmmlu_language_tasks"
"include": "_default_arabicmmlu_template_yaml" "include": "_default_arabicmmlu_template_yaml"
"task": "arabicmmlu_arabic_language_(grammar)" "tag": "arabicmmlu_language_tasks"
"task": "arabicmmlu_arabic_language_grammar"
"task_alias": "Arabic Language (Grammar)" "task_alias": "Arabic Language (Grammar)"
"dataset_name": "High Arabic Language" "dataset_name": "Arabic Language (High School)"
"tag": "arabicmmlu_language_tasks"
"include": "_default_arabicmmlu_template_yaml" "include": "_default_arabicmmlu_template_yaml"
"task": "arabicmmlu_high_arabic_language" "tag": "arabicmmlu_language_tasks"
"task_alias": "High Arabic Language" "task": "arabicmmlu_arabic_language_high_school"
"task_alias": "Arabic Language (High School)"
"dataset_name": "Middle Arabic Language" "dataset_name": "Arabic Language (Middle School)"
"tag": "arabicmmlu_language_tasks"
"include": "_default_arabicmmlu_template_yaml" "include": "_default_arabicmmlu_template_yaml"
"task": "arabicmmlu_middle_arabic_language" "tag": "arabicmmlu_language_tasks"
"task_alias": "Middle Arabic Language" "task": "arabicmmlu_arabic_language_middle_school"
"task_alias": "Arabic Language (Middle School)"
"dataset_name": "Primary Arabic Language" "dataset_name": "Arabic Language (Primary School)"
"tag": "arabicmmlu_language_tasks"
"include": "_default_arabicmmlu_template_yaml" "include": "_default_arabicmmlu_template_yaml"
"task": "arabicmmlu_primary_arabic_language" "tag": "arabicmmlu_language_tasks"
"task_alias": "Primary Arabic Language" "task": "arabicmmlu_arabic_language_primary_school"
"task_alias": "Arabic Language (Primary School)"
"dataset_name": "High Physics" "dataset_name": "Biology (High School)"
"tag": "arabicmmlu_stem_tasks"
"include": "_default_arabicmmlu_template_yaml" "include": "_default_arabicmmlu_template_yaml"
"task": "arabicmmlu_high_physics" "tag": "arabicmmlu_stem_tasks"
"task_alias": "High Physics" "task": "arabicmmlu_biology_high_school"
"task_alias": "Biology (High School)"
"dataset_name": "High Economics" "dataset_name": "Civics (High School)"
"tag": "arabicmmlu_social_science_tasks"
"include": "_default_arabicmmlu_template_yaml" "include": "_default_arabicmmlu_template_yaml"
"task": "arabicmmlu_high_economics" "tag": "arabicmmlu_social_science_tasks"
"task_alias": "High Economics" "task": "arabicmmlu_civics_high_school"
"task_alias": "Civics (High School)"
"dataset_name": "High Geography" "dataset_name": "Civics (Middle School)"
"tag": "arabicmmlu_social_science_tasks"
"include": "_default_arabicmmlu_template_yaml" "include": "_default_arabicmmlu_template_yaml"
"task": "arabicmmlu_high_geography" "tag": "arabicmmlu_social_science_tasks"
"task_alias": "High Geography" "task": "arabicmmlu_civics_middle_school"
"task_alias": "Civics (Middle School)"