# test-01.yaml
# Group definition whose `task` list mixes several entry forms:
# plain strings, a dict entry with its own keys, and a nested group.
group: test-1
group_alias: test 1
task:
  - piqa  # string task
  - ai2_arc  # string tag
  # - task: super-glue-lm-eval-v1 # Should this be spread out?
  #   num_fewshot: 3
  - task: swag  # dict registered task
    num_fewshot: 2
  # - task: mmlu
  #   num_fewshot: 5
  - group: nli-tasks  # dict group
    task:
      - anli
      - boolq
      - sglue_rte
    num_fewshot: 4
    metric_list:
      - metric: brier_score
    aggregate_metric: true
    
  # - task: sciq # dict registered task duplicate
  #   task_alias: sciq 2-shot
  #   num_fewshot: 2
  # - task: sciq # dict registered task duplicate
  #   task_alias: sciq 4-shot
  #   num_fewshot: 4
  # - task: sciq # dict registered task duplicate
  #   task_alias: sciq 6-shot
  #   num_fewshot: 6
  # - task: siqa_custom # dict task
  #   dataset_path: social_i_qa
  #   dataset_name: null
  #   output_type: multiple_choice
  #   training_split: train
  #   validation_split: validation
  #   doc_to_text: "Question: {{context}} {{question}}\nAnswer:"
  #   target_delimiter: " "
  #   doc_to_choice:
  #     - "{{answerA}}"
  #     - "{{answerB}}"
  #     - "{{answerC}}"
  #   doc_to_target: "{{ (label|int) - 1 }}"
  #   metric_list:
  #     - metric: acc
  #       aggregation: mean
  #       higher_is_better: true