# Group definition for the GPQA leaderboard suite: bundles the three GPQA
# subtasks listed under `task` and declares how their scores are combined
# into a single group-level number.
# NOTE(review): schema looks like an lm-evaluation-harness group config —
# confirm against the consuming tool.
group: leaderboard_gpqa

# Member tasks of the group; each name must match a task defined elsewhere.
task:
  - leaderboard_gpqa_diamond
  - leaderboard_gpqa_extended
  - leaderboard_gpqa_main

# One entry per metric to report at the group level.
aggregate_metric_list:
  - metric: acc_norm
    aggregation: mean
    # Presumably weights each subtask by its number of samples (micro-average)
    # rather than treating the subtasks equally — verify in the harness docs.
    weight_by_size: true