lm-evaluation-harness · commit 6862fa7d

Unverified commit 6862fa7d, authored Jul 14, 2023 by Lintang Sutawika, committed by GitHub on Jul 14, 2023.

Merge pull request #676 from EleutherAI/new-flags

[Refactor] Miscellaneous fixes

Parents: 98c85d73, f7dde0c3

Showing 10 changed files with 90 additions and 65 deletions (+90 -65).
lm_eval/api/instance.py                             +7  -7
lm_eval/api/task.py                                 +12 -15
lm_eval/evaluator.py                                +34 -17
lm_eval/models/huggingface.py                       +2  -0
lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py       +4  -6
lm_eval/tasks/pubmedqa/pubmedqa.yaml                +1  -1
lm_eval/tasks/race/preprocess_race.py               +8  -4
lm_eval/tasks/swag/swag.yaml                        +1  -1
lm_eval/tasks/winogrande/preprocess_winogrande.py   +4  -1
main.py                                             +17 -13
lm_eval/api/instance.py
@@ -4,13 +4,13 @@ from typing import Literal, Tuple
 @dataclass
 class Instance:
-    request_type: str = Literal[
+    request_type: Literal[
         "loglikelihood", "loglikelihood_rolling", "greedy_until"
     ]
-    doc: dict
-    arguments: tuple
-    idx: int
-    metadata: tuple = Tuple[str, int, int]  # TODO: better typehints here
+    doc: dict = None
+    arguments: tuple = None
+    idx: int = None
+    metadata: Tuple[str, int, int] = field(
+        default_factory=lambda: (None, None, None)
+    )  # TODO: better typehints here
     resps: list = field(default_factory=list)
     filtered_resps: dict = field(default_factory=dict)
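Two fixes here: the old `request_type: str = Literal[...]` annotated the field as `str` and assigned the `Literal` type object as its default value, and `metadata: tuple = Tuple[str, int, int]` likewise used a typing construct as a value. The new version annotates the fields properly and gives everything except `request_type` a default. A minimal sketch (constructor values invented, dataclass copied from the diff) of how the refactored class behaves:

from dataclasses import dataclass, field
from typing import Literal, Tuple

@dataclass
class Instance:
    request_type: Literal["loglikelihood", "loglikelihood_rolling", "greedy_until"]
    doc: dict = None
    arguments: tuple = None
    idx: int = None
    metadata: Tuple[str, int, int] = field(default_factory=lambda: (None, None, None))
    resps: list = field(default_factory=list)
    filtered_resps: dict = field(default_factory=dict)

# request_type is now the only required argument; the rest default sensibly,
# and default_factory gives each Instance its own mutable containers.
inst = Instance(request_type="loglikelihood")
assert inst.metadata == (None, None, None)
assert inst.resps == []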
lm_eval/api/task.py
@@ -8,6 +8,7 @@ import evaluate
 import random
 import itertools
 import functools
+from tqdm import tqdm

 import datasets
 import numpy as np

@@ -217,8 +218,8 @@ class Task(abc.ABC):
         self._filters.append(filter_pipeline)
         self.sampler = samplers.Sampler(
-            list(self.fewshot_docs()), self, rnd=random.Random()
-        )  # TODO: pass the correct docs in here
+            list(self.fewshot_docs()), self, rnd=random.Random(1234)
+        )

     def download(self, data_dir=None, cache_dir=None, download_mode=None):
         """Downloads and returns the task dataset.

@@ -366,13 +367,18 @@ class Task(abc.ABC):
             False
         ), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"

+        eval_logger.info(
+            f"Building contexts for task '{self._config.task}' on rank {rank}..."
+        )
+
         instances = []
         for doc_id, doc in utils.create_iterator(
            enumerate(docs), rank, world_size, limit
         ):
             # sample fewshot context #TODO: need to offset doc_id by rank now!
             fewshot_ctx = self.fewshot_context(
-                doc, self._config.num_fewshot, rnd=random.Random()
+                doc,
+                self._config.num_fewshot,
             )

             # TODO: we should override self._config.repeats if doing greedy gen so users don't waste time+compute

@@ -453,7 +459,7 @@ class Task(abc.ABC):
         return len(re.split(r"\s+", doc))

     @utils.positional_deprecated
-    def fewshot_context(self, doc, num_fewshot, rnd=None):
+    def fewshot_context(self, doc, num_fewshot):
         """Returns a fewshot context string that is made up of a prepended description
         (if provided), the `num_fewshot` number of examples, and an appended prompt example.

@@ -461,15 +467,9 @@ class Task(abc.ABC):
             The document as returned from training_docs, validation_docs, or test_docs.
         :param num_fewshot: int
             The number of fewshot examples to provide in the returned context string.
-        :param rnd: random.Random
-            The pseudo-random number generator used to randomly sample examples.
-            WARNING: This is currently a required arg although it's optionalized with a default `None`.
         :returns: str
             The fewshot context.
         """
-        assert (
-            rnd is not None
-        ), "A `random.Random` generator argument must be provided to `rnd`"
-
         if num_fewshot == 0:
             # always prepend the (possibly empty) task description

@@ -625,7 +625,7 @@ class ConfigurableTask(Task):
         if self.fewshot_docs() is not None:
             self.sampler = samplers.Sampler(
-                list(self.fewshot_docs()), self, rnd=random.Random()
+                list(self.fewshot_docs()), self, rnd=random.Random(1234)
             )

     def download(self, dataset_kwargs=None):

@@ -1004,13 +1004,10 @@ class PerplexityTask(Task):
             assert k == 0
             return []

-    def fewshot_context(self, doc, num_fewshot, rnd=None):
+    def fewshot_context(self, doc, num_fewshot):
         assert (
             num_fewshot == 0
         ), "The number of fewshot examples must be 0 for perplexity tasks."
-        assert (
-            rnd is not None
-        ), "A `random.Random` generator argument must be provided to `rnd`."
-
         return ""
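The net effect of these changes is that callers no longer thread an RNG through `fewshot_context`; the few-shot sampler owns a generator seeded at construction with `random.Random(1234)`, so example selection is reproducible across runs instead of varying with every freshly constructed `random.Random()`. A minimal sketch (names and values invented, not from the diff) of why that matters:

import random

docs = list(range(10))  # stand-in for list(self.fewshot_docs())

def sample_fewshot(pool, k, rnd):
    pool = pool[:]      # copy so the shared pool isn't mutated
    rnd.shuffle(pool)
    return pool[:k]

a = sample_fewshot(docs, 3, random.Random(1234))
b = sample_fewshot(docs, 3, random.Random(1234))
c = sample_fewshot(docs, 3, random.Random())  # unseeded: differs per run

assert a == b  # a fixed seed picks the same few-shot examples every time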
lm_eval/evaluator.py
@@ -45,6 +45,7 @@ def simple_evaluate(
     check_integrity=False,
     decontamination_ngrams_path=None,
     write_out=False,
+    log_samples=True,
 ):
     """Instantiate and evaluate a model on a list of tasks.

@@ -72,12 +73,17 @@ def simple_evaluate(
     :param check_integrity: bool
         Whether to run the relevant part of the test suite for the tasks
     :param write_out: bool
-        If True, write details about prompts and logits to json for all tasks
+        If True, write out an example document and model input for checking task integrity
+    :param log_samples: bool
+        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
     :return
         Dictionary of results
     """
-    random.seed(1234)
+    random.seed(0)
     np.random.seed(1234)
+    torch.manual_seed(
+        1234
+    )  # TODO: this may affect training runs that are run with evaluation mid-run.

     assert tasks != [], "No tasks specified"

@@ -118,6 +124,7 @@ def simple_evaluate(
         bootstrap_iters=bootstrap_iters,
         decontamination_ngrams_path=decontamination_ngrams_path,
         write_out=write_out,
+        log_samples=log_samples,
     )

     if lm.rank == 0:

@@ -154,6 +161,7 @@ def evaluate(
     bootstrap_iters=100000,
     decontamination_ngrams_path=None,
     write_out=False,
+    log_samples=True,
 ):
     """Instantiate and evaluate a model on a list of tasks.

@@ -168,7 +176,9 @@ def evaluate(
     :param bootstrap_iters:
         Number of iterations for bootstrap statistics
     :param write_out: bool
-        If True, write all prompts, logits and metrics to json for offline analysis
+        If True, write out an example document and model input for checking task integrity
+    :param log_samples: bool
+        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
     :return
         Dictionary of results
     """

@@ -213,7 +223,10 @@ def evaluate(
     # aggregate Instances by LM method requested to get output.
     reqtype = (
         "loglikelihood"
-        if (task.OUTPUT_TYPE == "multiple_choice" or task.OUTPUT_TYPE == "winograd_schema")
+        if (
+            task.OUTPUT_TYPE == "multiple_choice"
+            or task.OUTPUT_TYPE == "winograd_schema"
+        )
         else task.OUTPUT_TYPE
     )  # TODO: this is hacky, fix in task.py
     requests[reqtype].extend(task.instances)

@@ -279,17 +292,18 @@ def evaluate(
                 metrics = task.process_results(
                     doc, [req.filtered_resps[key] for req in requests]
                 )
-                target = task.doc_to_target(doc)
-                example = {
-                    "doc_id": doc_id,
-                    "doc": doc,
-                    "target": target,
-                    "arguments": requests[0].args,
-                    "resps": [req.resps for req in requests],
-                    "filtered_resps": [req.filtered_resps[key] for req in requests],
-                }
-                example.update(metrics)
-                samples[task_name].append(example)
+                if log_samples:
+                    target = task.doc_to_target(doc)
+                    example = {
+                        "doc_id": doc_id,
+                        "doc": doc,
+                        "target": target,
+                        "arguments": [req.args for req in requests],
+                        "resps": [req.resps for req in requests],
+                        "filtered_resps": [req.filtered_resps[key] for req in requests],
+                    }
+                    example.update(metrics)
+                    samples[task_name].append(example)
                 for metric, value in metrics.items():
                     vals[(task_name, key, metric)].append(value)

@@ -359,12 +373,15 @@ def evaluate(
             if stderr is not None:
                 results[task_name][metric + "_stderr" + "," + key] = stderr(items)

-        return {
-            "results": dict(results),
-            "configs": dict(configs),
-            "versions": dict(versions),
-            "samples": samples,
-        }
+        results_dict = {
+            "results": dict(results),
+            "configs": dict(configs),
+            "versions": dict(versions),
+        }
+        if log_samples:
+            results_dict["samples"] = dict(samples)
+
+        return results_dict

     else:
         return None
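The new `log_samples` flag separates aggregate metrics from per-sample records: `write_out` is now only a task-integrity spot check, while `log_samples` controls whether every document, target, argument set, and model response is kept in the returned dict. A hedged usage sketch (the model and task arguments below are placeholders, not from this diff):

from lm_eval import evaluator

results = evaluator.simple_evaluate(
    model="hf",                    # placeholder model type
    model_args="pretrained=gpt2",  # placeholder model args
    tasks=["pubmedqa"],            # placeholder task list
    num_fewshot=0,
    log_samples=False,             # skip per-sample logging
)

# "samples" is only attached when log_samples=True (the default)
assert "samples" not in results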
lm_eval/models/huggingface.py
@@ -70,6 +70,7 @@ class HFLM(LM):
         batch_size: Optional[int] = 1,
         low_cpu_mem_usage: Optional[bool] = True,
         trust_remote_code: Optional[bool] = False,
+        use_fast_tokenizer: Optional[bool] = True,
         # arguments used for splitting a model across GPUs naively.
         # only used if `parallelize=True`.
         parallelize: Optional[bool] = False,

@@ -216,6 +217,7 @@ class HFLM(LM):
             pretrained if tokenizer is None else tokenizer,
             revision=revision,
             trust_remote_code=trust_remote_code,
+            use_fast=use_fast_tokenizer,
         )
         self.vocab_size = self.tokenizer.vocab_size
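The new `use_fast_tokenizer` argument is plumbed straight through to the tokenizer constructor as `use_fast`. A minimal sketch (using the Hugging Face transformers API directly, outside the harness) of what the flag selects:

from transformers import AutoTokenizer

# use_fast=True (the new default) loads the Rust-backed tokenizer when one
# exists; use_fast=False forces the pure-Python implementation, useful for
# models whose fast tokenizer misbehaves.
fast = AutoTokenizer.from_pretrained("gpt2", use_fast=True)
slow = AutoTokenizer.from_pretrained("gpt2", use_fast=False)
print(type(fast).__name__)  # GPT2TokenizerFast
print(type(slow).__name__)  # GPT2Tokenizer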
lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py
@@ -4,13 +4,11 @@ def doc_to_text(doc):
         ctxs, doc["question"], doc["final_decision"]
     )

 def doc_to_target(doc):
     return " {}".format(doc["final_decision"])

 def gold_alias(doc):
-    dict_to_label = {
-        'yes': 0,
-        'no': 1,
-        'maybe': 2}
-    return dict_to_label[doc['final_decision']]
\ No newline at end of file
+    dict_to_label = {"yes": 0, "no": 1, "maybe": 2}
+    return dict_to_label[doc["final_decision"]]
lm_eval/tasks/pubmedqa/pubmedqa.yaml
@@ -14,4 +14,4 @@ gold_alias: !function preprocess_pubmedqa.gold_alias
 metric_list:
   - metric: acc
     aggregation: mean
-    higher_is_better: true
\ No newline at end of file
+    higher_is_better: true
lm_eval/tasks/race/preprocess_race.py
 import ast

 def process_ast(string):
     return ast.literal_eval(string)

 def last_problem(doc):
     return process_ast(doc["problems"])[-1]

 def get_answer_option(problem):
     letter_to_num = {"A": 0, "B": 1, "C": 2, "D": 3}
     answer = letter_to_num[problem["answer"]]
     return problem["options"][answer]

 def create_choices(doc):
     problem = last_problem(doc)
     choices = [problem["options"][i] for i in range(4)]
     return choices

 def doc_to_text(doc):
     text = "Article: " + doc["article"] + "\n\n"
     for problem in process_ast(doc["problems"])[:-1]:
         if problem["question"][-6:] == "  _  .":
-            text += problem["question"][-5:] + get_answer_option(problem) + "\n"
+            text += (
+                problem["question"][-5:] + get_answer_option(problem) + "\n"
+            )
         else:
             question = "Question: " + problem["question"] + "\n"
             answer = "Answer: " + get_answer_option(problem) + "\n"

@@ -30,6 +33,7 @@ def doc_to_text(doc):
     text += last_problem(doc)["question"]
     return text

+
 def doc_to_target(doc):
     letter_to_num = {"A": 0, "B": 1, "C": 2, "D": 3}
     answer = letter_to_num[last_problem(doc)["answer"]]
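A toy example (document contents invented) showing what the RACE helpers do: `"problems"` arrives as a stringified Python list, so `process_ast()` parses it, `last_problem()` selects the final question as the query, and `get_answer_option()` maps an answer letter back to its option text; earlier problems become in-context examples in `doc_to_text()`.

doc = {
    "article": "The cat sat on the mat.",
    "problems": str([
        {"question": "What did the cat do?",
         "options": ["sat", "ran", "ate", "slept"], "answer": "A"},
        {"question": "Where did it sit?",
         "options": ["the mat", "a sofa", "the floor", "a bed"], "answer": "A"},
    ]),
}

assert last_problem(doc)["question"] == "Where did it sit?"
assert get_answer_option(last_problem(doc)) == "the mat"
assert create_choices(doc) == ["the mat", "a sofa", "the floor", "a bed"]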
lm_eval/tasks/swag/swag.yaml
@@ -17,4 +17,4 @@ metric_list:
     higher_is_better: true
   - metric: acc_norm
     aggregation: mean
-    higher_is_better: true
\ No newline at end of file
+    higher_is_better: true
lm_eval/tasks/winogrande/preprocess_winogrande.py
@@ -4,11 +4,13 @@ def partial_context(doc, option):
     pronoun_loc = doc["sentence"].index("_")
     return doc["sentence"][:pronoun_loc] + option

+
 def partial_target(doc):
     # The target is everything after the document specified pronoun.
     pronoun_loc = doc["sentence"].index("_") + 1
     return doc["sentence"][pronoun_loc:].strip()

+
 def create_choices(doc):
     choices = []
     for option in [doc["option1"], doc["option2"]]:

@@ -16,6 +18,7 @@ def create_choices(doc):
         choices.append(partial_ctx)
     return choices

+
 def gold_alias(doc):
     answer_to_num = {"1": 0, "2": 1}
-    return answer_to_num[doc['answer']]
\ No newline at end of file
+    return answer_to_num[doc["answer"]]
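A toy example (sentence invented) of how the Winogrande helpers split a schema around the `_` blank: each option fills the prefix, and the shared suffix becomes the continuation whose loglikelihood is scored.

doc = {
    "sentence": "The trophy didn't fit in the suitcase because _ was too big.",
    "option1": "the trophy",
    "option2": "the suitcase",
    "answer": "1",
}

print(create_choices(doc))
# ["The trophy didn't fit in the suitcase because the trophy",
#  "The trophy didn't fit in the suitcase because the suitcase"]
print(partial_target(doc))  # "was too big."
print(gold_alias(doc))      # 0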
main.py
@@ -43,6 +43,7 @@ def parse_args():
     parser.add_argument("--decontamination_ngrams_path", default=None)
     parser.add_argument("--check_integrity", action="store_true")
     parser.add_argument("--write_out", action="store_true", default=False)
+    parser.add_argument("--log_samples", action="store_true", default=True)
     return parser.parse_args()

@@ -89,10 +90,12 @@ def main():
         decontamination_ngrams_path=args.decontamination_ngrams_path,
         check_integrity=args.check_integrity,
         write_out=args.write_out,
+        log_samples=args.log_samples,
     )

     if results is not None:
-        samples = results.pop("samples")
+        if args.log_samples:
+            samples = results.pop("samples")
         dumped = json.dumps(results, indent=2, default=lambda o: str(o))
         print(dumped)

@@ -104,19 +107,20 @@ def main():
         with open(args.output_path, "w") as f:
             f.write(dumped)

-        for task_name, config in results["configs"].items():
-            output_name = "{}_{}".format(
-                re.sub("/", "__", args.model_args), task_name
-            )
-            if os.path.isdir(args.output_path):
-                filename = f"./{args.output_path}/{output_name}.jsonl"
-            elif os.path.isfile(args.output_path):
-                filename = (
-                    f"./{os.path.dirname(args.output_path)}/{output_name}.jsonl"
-                )
-            with jsonlines.open(filename, "w") as f:
-                f.write_all(samples[task_name])
+        if args.log_samples:
+            for task_name, config in results["configs"].items():
+                output_name = "{}_{}".format(
+                    re.sub("/", "__", args.model_args), task_name
+                )
+                if os.path.isdir(args.output_path):
+                    filename = f"./{args.output_path}/{output_name}.jsonl"
+                elif os.path.isfile(args.output_path):
+                    filename = (
+                        f"./{os.path.dirname(args.output_path)}/{output_name}.jsonl"
+                    )
+                with jsonlines.open(filename, "w") as f:
+                    f.write_all(samples[task_name])

         print(
             f"{args.model} ({args.model_args}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "
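With the flag wired through, a run emits an aggregate JSON report at --output_path plus, per task, a samples file named <model_args>_<task_name>.jsonl (with "/" in model_args rewritten to "__") alongside it. A hedged invocation sketch; the model, task, and path values are placeholders:

python main.py \
    --model hf \
    --model_args pretrained=gpt2 \
    --tasks pubmedqa \
    --output_path results/ \
    --log_samples

One quirk worth noting: `--log_samples` is declared with `action="store_true"` and `default=True`, so as merged the flag is effectively always on; passing it on the command line only re-asserts the default.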