# mbpp_evalplus.yaml
# Task identity and dataset source.
task: mbpp_evalplus
dataset_path: google-research-datasets/mbpp
dataset_name: full
test_split: test

# Generation mode.
# NOTE(review): unsafe_code presumably acknowledges that scoring executes
# model-written code; confirm against the harness documentation.
output_type: generate_until
unsafe_code: true
# NOTE(review): presumably the number of generations sampled per document;
# confirm it is large enough for the k used by the metric.
repeats: 20

# Earlier prompt format, kept for reference:
# doc_to_text: "You are an expert Python programmer, and here is your task: {{text}} Your code should pass these tests:\n\n{{test_list[0]}}\n{{test_list[1]}}\n{{test_list[2]}}\n[BEGIN]\n"
# Prompt: the trimmed problem text followed by one test_list entry picked by
# Jinja's `random` filter, wrapped in a bare ``` fence.
doc_to_text: |
  Please provide a self-contained Python script that solves the following problem in a markdown code block:
  ```
  {{text|trim}}
  {{test_list|random}}
  ```

# Target: the reference code plus "[DONE]" when is_fewshot is defined,
# otherwise the first three test_list entries separated by newlines.
doc_to_target: "{% if is_fewshot is defined %}{{code}}\n[DONE]{% else %}{{test_list[0]}}\n{{test_list[1]}}\n{{test_list[2]}}{% endif %}"
target_delimiter: ''
# NOTE(review): presumably prepended to the model's response; the value ends
# with an opened ```python fence followed by a newline.
gen_prefix: |
  Here is the completed function:

  ```python
# Scoring: a single metric computed by utils.pass_at_k with k = 10,
# aggregated by mean; larger values are better.
metric_list:
  - metric: !function utils.pass_at_k
    aggregation: mean
    higher_is_better: true
    k: [10]
# Output filters
# One filter pipeline named "create_test", consisting of a single custom
# step that delegates to utils.build_predictions.
filter_list:
  - name: create_test
    filter:
      - function: custom
        filter_fn: !function utils.build_predictions
# Sampling and stopping parameters for generation.
generation_kwargs:
  # Stop sequences. Every entry is double-quoted so that "\n" is parsed as a
  # real newline; YAML single-quoted scalars do not process backslash
  # escapes, which previously left the triple-quote entry as a literal
  # backslash followed by "n".
  until:
    - "\nclass"
    - "\nassert"
    - "\n\"\"\""
    - "\nprint"
    - "\nif"
    - "\n```"
    - "\n#"
    - "\n<|/"
    - "<|eot_id|>"
  do_sample: true
  temperature: 0.8
  top_p: 0.95
  max_gen_toks: 512
# Few-shot settings and metadata
# Zero few-shot examples are configured. fewshot_config names
# utils.list_fewshot_samples as the sample source and first_n as the sampler.
num_fewshot: 0
fewshot_config:
  samples: !function utils.list_fewshot_samples
  sampler: first_n
metadata:
  version: 1.0