# Group definition bundling the BIG-Bench Hard (BBH) chain-of-thought
# few-shot subtasks under the single group name `bbh`, and declaring how
# their per-task scores are combined into one group-level score.
group: bbh

# Member tasks of the group, in alphabetical order. Each entry is the name
# of a task that is defined outside this file.
task:
  - bbh_cot_fewshot_boolean_expressions
  - bbh_cot_fewshot_causal_judgement
  - bbh_cot_fewshot_date_understanding
  - bbh_cot_fewshot_disambiguation_qa
  - bbh_cot_fewshot_dyck_languages
  # NOTE(review): was `bbh_cot_fewshot_formal_languages`; BBH has no such
  # subtask -- the suite's subtask is `formal_fallacies`. Confirm the
  # matching task definition exists under this name.
  - bbh_cot_fewshot_formal_fallacies
  - bbh_cot_fewshot_geometric_shapes
  - bbh_cot_fewshot_hyperbaton
  - bbh_cot_fewshot_logical_deduction_five_objects
  - bbh_cot_fewshot_logical_deduction_seven_objects
  - bbh_cot_fewshot_logical_deduction_three_objects
  - bbh_cot_fewshot_movie_recommendation
  - bbh_cot_fewshot_multistep_arithmetic_two
  - bbh_cot_fewshot_navigate
  - bbh_cot_fewshot_object_counting
  - bbh_cot_fewshot_penguins_in_a_table
  - bbh_cot_fewshot_reasoning_about_colored_objects
  - bbh_cot_fewshot_ruin_names
  - bbh_cot_fewshot_salient_translation_error_detection
  - bbh_cot_fewshot_snarks
  - bbh_cot_fewshot_sports_understanding
  - bbh_cot_fewshot_temporal_sequences
  - bbh_cot_fewshot_tracking_shuffled_objects_five_objects
  - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects
  - bbh_cot_fewshot_tracking_shuffled_objects_three_objects
  - bbh_cot_fewshot_web_of_lies
  - bbh_cot_fewshot_word_sorting

# Group-level aggregation: take the mean of each member task's
# `exact_match`, with `weight_by_size` enabled.
# NOTE(review): key renamed from `aggregate_metric`; the harness group
# config is expected to read `aggregate_metric_list` -- confirm against
# the installed harness version.
aggregate_metric_list:
  - metric: exact_match
    aggregation: mean
    weight_by_size: true

metadata:
  # Kept as an unquoted number, exactly as in the original.
  version: 2.0