ModelZoo / FlagEmbedding_pytorch · Commit f75058c7

Authored Jul 25, 2024 by Rayyyyy

    First add.

Pipeline #1411 canceled with stages.

Showing 3 of 303+ changed files, with 604 additions and 0 deletions:
Long_LLM/activation_beacon/old/main/eval_lm.py        (+178, -0)
Long_LLM/activation_beacon/old/main/eval_longbench.py (+264, -0)
Long_LLM/activation_beacon/old/main/eval_longeval.py  (+162, -0)
Long_LLM/activation_beacon/old/main/eval_lm.py (new file, mode 100644)
import os
import datasets
import time
import torch
from datetime import timedelta
from typing import Optional
from collections import defaultdict
from dataclasses import dataclass, field, asdict
from accelerate import Accelerator, InitProcessGroupKwargs
from transformers import HfArgumentParser
from torch.utils.data import DataLoader

from src import ModelArgs, DatasetProcessFn, DefaultDataCollator, FileLogger, get_model_and_tokenizer, makedirs, split_file_dir_name_ext, evaluate_perplexity


@dataclass
class Args(ModelArgs):
    eval_data: str = field(
        default="activation-beacon:lm/pg19.json",
        metadata={'help': 'The evaluation json data path.'}
    )
    output_dir: str = field(
        default="data/results/lm/",
        metadata={'help': 'Output directory for results and logs.'}
    )
    retokenize: bool = field(
        default=False,
        metadata={'help': 'Retokenize the corpus?'}
    )
    tokenize_max_char: Optional[int] = field(
        default=None,
        metadata={'help': 'The number of chars to truncate.'}
    )
    batch_size: int = field(
        default=1,
        metadata={'help': 'Evaluation batch size.'}
    )
    padding_side: str = field(
        default="right",
        metadata={'help': 'Which side to pad?'}
    )
    stride: int = field(
        default=2048,
        metadata={'help': 'Streaming stride when evaluating perplexity.'}
    )
    max_sample_num: int = field(
        default=100,
        metadata={'help': 'How many samples to evaluate in eval_data?'}
    )
    min_length: Optional[int] = field(
        default=None,
        metadata={'help': 'Minimum length for input_ids.'}
    )


def process_lm_pre(tokenizer, tokenize_max_char=None):
    @DatasetProcessFn()
    def _process(text, **kwds):
        if tokenize_max_char is not None:
            text = text[:tokenize_max_char]
        output = {"input_ids": tokenizer.encode(text, add_special_tokens=False)}
        return output
    return _process


def process_lm(tokenizer, max_length=4096, stride=1024, min_length=None):
    # stride=0 indicates we just use one forward pass with max_length for each text
    if stride == 0:
        stride = max_length
        jump = True
    else:
        jump = False

    test = tokenizer.encode("test")
    has_bos = False
    if test[0] == tokenizer.bos_token_id:
        # NOTE: subtract 1 because it will be occupied by the bos token
        max_length -= 1
        has_bos = True

    @DatasetProcessFn(augment=True)
    def _process(input_ids, _index, **kwds):
        outputs = defaultdict(list)
        seq_len = len(input_ids)
        prev_end_loc = 0

        if min_length is not None and seq_len < min_length:
            return

        for start_loc in range(0, seq_len, stride):
            end_loc = min(start_loc + max_length, seq_len)
            sub_seq_len = end_loc - start_loc
            sub_trg_len = end_loc - prev_end_loc  # may be different from stride on last loop

            sub_input_ids = input_ids[start_loc: end_loc]
            sub_attention_mask = [1 for _ in range(sub_seq_len)]
            if has_bos:
                sub_input_ids.insert(0, tokenizer.bos_token_id)
                sub_attention_mask.insert(0, 1)
                sub_seq_len += 1

            sub_labels = sub_input_ids.copy()
            sub_labels[:-sub_trg_len] = [-100 for _ in range(sub_seq_len - sub_trg_len)]

            sub_inputs = {
                "index": _index,
                "input_ids": sub_input_ids,
                "attention_mask": sub_attention_mask,
                "labels": sub_labels,
            }
            for k, v in sub_inputs.items():
                outputs[k].append(v)

            prev_end_loc = end_loc
            # NOTE: when end_loc is just the same as seq_len, jump out
            if end_loc == seq_len or jump:
                break
        return outputs
    return _process


def main():
    parser = HfArgumentParser([Args])
    args: Args = parser.parse_args_into_dataclasses()[0]

    # increase timeout to avoid error
    accelerator = Accelerator(cpu=args.cpu, kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=100000))])
    model, tokenizer = get_model_and_tokenizer(args, accelerator=accelerator)

    _, dataset_name, _ = split_file_dir_name_ext(args.eval_data)
    tokenized_dataset_path = os.path.join(args.output_dir, dataset_name, "tokenized_inputs")

    with accelerator.main_process_first():
        if not os.path.exists(tokenized_dataset_path) or args.retokenize:
            pre_process_fn = process_lm_pre(tokenizer=tokenizer, tokenize_max_char=args.tokenize_max_char)
            raw_dataset = datasets.load_dataset("json", data_files=args.eval_data, cache_dir=args.dataset_cache_dir, split="train")
            tokenized_dataset = raw_dataset.map(pre_process_fn, batched=True, num_proc=32, remove_columns=raw_dataset.column_names, batch_size=32)
            tokenized_dataset.save_to_disk(tokenized_dataset_path)

        tokenized_dataset = datasets.load_from_disk(tokenized_dataset_path)

        process_fn = process_lm(tokenizer, max_length=args.max_length, stride=args.stride, min_length=args.min_length)
        if len(tokenized_dataset) > args.max_sample_num:
            # slice out the first max_sample_num samples
            tokenized_dataset = tokenized_dataset.train_test_split(args.max_sample_num, shuffle=False)["test"]
        dataset = tokenized_dataset.map(process_fn, batched=True, num_proc=32, remove_columns=tokenized_dataset.column_names, keep_in_memory=True, with_indices=True)

    data_collator = DefaultDataCollator(tokenizer=tokenizer)
    dataloader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        collate_fn=data_collator,
        # only pin memory when no gpu
        pin_memory=not args.cpu,
    )

    t1 = time.time()
    perplexity = evaluate_perplexity(model, dataloader, accelerator)
    t2 = time.time()

    memory = torch.cuda.max_memory_allocated() / 1024 ** 2
    metrics = {"perplexity": perplexity, "time": round((t2 - t1) / len(dataset), 4), "memory": memory}

    if accelerator.process_index == 0:
        log_path = os.path.join(args.output_dir, f"{dataset_name}.log")
        file_logger = FileLogger(makedirs(log_path))
        file_logger.log(metrics, Args=asdict(args))


if __name__ == "__main__":
    main()
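The heart of eval_lm.py is the strided sliding window in process_lm: each window re-reads up to max_length tokens of context, but every token already scored by a previous window has its label set to -100, so each token contributes to the perplexity exactly once. Below is a minimal self-contained sketch of that labeling logic, with toy window sizes and without the tokenizer/BOS handling of the real function:

# Sketch of process_lm's sliding-window labeling (toy sizes, no tokenizer).
def sliding_windows(input_ids, max_length=8, stride=4):
    """Yield (window, labels) pairs; labels are -100 for tokens whose loss
    was already counted by a previous window, so each token is scored once."""
    prev_end = 0
    for start in range(0, len(input_ids), stride):
        end = min(start + max_length, len(input_ids))
        window = input_ids[start:end]
        trg_len = end - prev_end  # tokens not yet scored
        labels = [-100] * (len(window) - trg_len) + window[-trg_len:]
        yield window, labels
        prev_end = end
        if end == len(input_ids):
            break

# Example: 12 tokens, window 8, stride 4 -> the second window masks the
# 4 overlapping tokens it shares with the first.
for window, labels in sliding_windows(list(range(12))):
    print(window, labels)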
Long_LLM/activation_beacon/old/main/eval_longbench.py (new file, mode 100644)
import os
import datasets
import json
import torch
from tqdm import tqdm
from typing import Optional, Dict, List
from collections import defaultdict
from dataclasses import dataclass, field, asdict
from accelerate import Accelerator
from transformers import HfArgumentParser
from transformers.utils import logging
from torch.utils.data import DataLoader

from src import ModelArgs, DatasetProcessFn, DefaultDataCollator, FileLogger, get_model_and_tokenizer, makedirs
from .longbench_utils import DATASET2PROMPT, DATASET2MAXNEWTOKENS, DATASET2CATEGORY, scorer, scorer_e

logger = logging.get_logger(__name__)


@dataclass
class Args(ModelArgs):
    eval_data: str = field(
        default="activation-beacon:longbench/test.json",
        metadata={'help': 'The evaluation json data path.'}
    )
    output_dir: str = field(
        default="data/results/longbench/",
        metadata={'help': 'Output directory for results and logs.'}
    )
    batch_size: int = field(
        default=1,
        metadata={'help': 'Evaluation batch size.'}
    )
    dataset_names: List[str] = field(
        default_factory=lambda: ['narrativeqa', 'qasper', 'multifieldqa_en', 'hotpotqa', '2wikimqa', 'musique', 'gov_report', 'qmsum', 'multi_news', 'trec', 'triviaqa', 'samsum', 'lcc', 'repobench-p'],
        metadata={'help': 'Which dataset to evaluate?'}
    )
    model_name_or_path: str = field(
        default="meta-llama/Llama-2-7b-chat-hf",
        metadata={'help': 'Model name on huggingface.'}
    )
    max_length: int = field(
        default=3500,
        metadata={'help': 'Max input length.'}
    )
    truncate_from_middle: bool = field(
        default=True,
        metadata={'help': 'Truncate inputs from the middle.'}
    )
    load_result: bool = field(
        default=False,
        metadata={'help': 'Load result from saved files?'}
    )


def process_longbench(tokenizer, prompt_templates: Optional[Dict] = None, max_length=3500, add_chat_inst=False, truncate_from_middle=True):
    @DatasetProcessFn()
    def _process(input: str, context: str, dataset: str, all_classes: Optional[List], answers: List[str], length: int, _index: int, **kwds):
        output = {}

        if dataset.endswith("_e"):
            dataset = dataset[:-2]

        prompt_template = prompt_templates[dataset]
        prompt = prompt_template.format(input=input, context=context)

        if truncate_from_middle:
            tokenized_prompt = tokenizer.encode(prompt)
            if len(tokenized_prompt) > max_length:
                half = int(max_length / 2)
                prompt = tokenizer.decode(tokenized_prompt[:half], skip_special_tokens=True) + tokenizer.decode(tokenized_prompt[-half:], skip_special_tokens=True)
        else:
            tokenized_prompt = tokenizer.encode(prompt)
            prompt = tokenizer.decode(tokenized_prompt[-max_length:], skip_special_tokens=True)

        # chat models are better off without chat prompts on these tasks
        if add_chat_inst:
            prompt = f"[INST]{prompt}[/INST]"

        output = tokenizer(prompt, padding=False, truncation=False)
        output["dataset"] = dataset
        output["idx"] = _index
        return output
    return _process


@torch.no_grad()
def main():
    parser = HfArgumentParser([Args])
    args = parser.parse_args_into_dataclasses()[0]

    if ".e." in args.eval_data:
        args.output_dir = args.output_dir.replace("longbench", "longbench_e")
    else:
        args.output_dir = args.output_dir.replace("longbench_e", "longbench")

    result_dir_components = [args.output_dir, args.model_name_or_path.strip(os.sep).replace(os.sep, "--"), str(args.max_length)]
    result_dir = os.path.join(*result_dir_components)

    accelerator = Accelerator(cpu=args.cpu)
    model, tokenizer = get_model_and_tokenizer(args, accelerator=accelerator)

    with accelerator.main_process_first():
        process_fn = process_longbench(
            tokenizer,
            max_length=args.max_length,
            prompt_templates=DATASET2PROMPT,
            add_chat_inst=args.add_chat_inst,
            truncate_from_middle=args.truncate_from_middle,
        )

        raw_dataset = datasets.load_dataset("json", data_files=args.eval_data, cache_dir=args.dataset_cache_dir, split="train")
        dataset = raw_dataset.map(process_fn, batched=True, num_proc=32, with_indices=True, remove_columns=raw_dataset.column_names)

    groupby_dataset = dataset.to_pandas().groupby("dataset")

    metrics = {}
    if args.dataset_names is None:
        dataset_names = [key for key, _ in groupby_dataset]
    else:
        dataset_names = args.dataset_names

    for i, dataset_name in enumerate(dataset_names):
        if accelerator.process_index == 0:
            logger.info(f"Evaluating {dataset_name} ({i + 1}/{len(dataset_names)})...")

        result_path = os.path.join(result_dir, f"{dataset_name}.json")

        if args.load_result and os.path.exists(result_path):
            if accelerator.process_index == 0:
                with open(result_path, encoding="utf-8") as f:
                    score = json.loads(f.readline())
                logger.info(f"{dataset_name}: {score}")
                metrics[dataset_name] = score
        else:
            dataset = datasets.Dataset.from_pandas(groupby_dataset.get_group(dataset_name), preserve_index=False)

            data_collator = DefaultDataCollator(tokenizer=tokenizer)
            dataloader = DataLoader(
                dataset,
                batch_size=args.batch_size,
                collate_fn=data_collator,
                # only pin memory when no gpu
                pin_memory=not args.cpu,
            )
            dataloader = accelerator.prepare(dataloader)

            indices = []
            preds = []
            max_new_tokens = DATASET2MAXNEWTOKENS[dataset_name]

            for i, x in enumerate(tqdm(dataloader, desc="Generating")):
                x.pop("dataset")
                idx = x.pop("idx")[0]
                input_length = x["input_ids"].shape[1]

                # NOTE: important to reset memory for every batch
                if hasattr(model, "memory") and model.memory is not None:
                    model.memory.reset()

                # NOTE: very important to include \n as an eos token for QA and trec, otherwise the F1 score is devastating
                if dataset_name in ["2wikimqa", "hotpotqa", "musique", "multifieldqa_en", "qasper", "narrativeqa", "samsum"]:
                    output = model.generate(
                        **x,
                        max_new_tokens=max_new_tokens,
                        num_beams=1,
                        do_sample=False,
                        eos_token_id=[tokenizer.eos_token_id, tokenizer.encode("\n", add_special_tokens=False)[-1]],
                        # prevent warning
                        temperature=1.0,
                        top_p=1.0,
                    )
                else:
                    output = model.generate(
                        **x,
                        max_new_tokens=max_new_tokens,
                        num_beams=1,
                        do_sample=False,
                        temperature=1.0,
                        top_p=1.0,
                    )

                # 1, max_new_tokens
                output = output[:, input_length:]
                # pad across device to the same length
                output = accelerator.pad_across_processes(output.contiguous(), pad_index=tokenizer.pad_token_id, dim=1)
                # num_device, max_new_tokens
                output = accelerator.gather_for_metrics(output)
                idx = accelerator.gather_for_metrics(idx).tolist()

                if accelerator.process_index == 0:
                    pred = tokenizer.batch_decode(output, skip_special_tokens=True)
                    preds.extend(pred)
                    if isinstance(idx, list):
                        indices.extend(idx)
                    else:
                        # single process
                        indices.append(idx)

            if accelerator.process_index == 0:
                raw_dataset_subset = raw_dataset[indices]
                answers = raw_dataset_subset["answers"]
                lengths = raw_dataset_subset["length"]
                all_classes = raw_dataset_subset["all_classes"][0]
                if '.e.' in args.eval_data:
                    score = scorer_e(dataset_name, preds, answers, lengths, all_classes)
                else:
                    score = scorer(dataset_name, preds, answers, all_classes)

                logger.info(f"{dataset_name}: {score}")
                metrics[dataset_name] = score

                with open(makedirs(result_path), "w", encoding="utf-8") as f:
                    f.write(json.dumps(score, ensure_ascii=False) + "\n")
                    for idx, pred in zip(indices, preds):
                        sample = raw_dataset[idx]
                        del sample["all_classes"]
                        del sample["context"]
                        del sample["language"]
                        del sample["_id"]
                        sample["pred"] = pred
                        f.write(json.dumps(sample, ensure_ascii=False) + "\n")

    if accelerator.process_index == 0:
        log_path = os.path.join(args.output_dir, "metrics.log")

        # compute category score
        category_metrics = defaultdict(list)
        for dataset, metric in metrics.items():
            category = DATASET2CATEGORY[dataset]
            category_metrics[category].append(metric)
        for k, v in category_metrics.items():
            # when evaluating on longbench_e, each metric is a dict of float
            if isinstance(v[0], dict):
                category_metric = {}
                for kk in v[0].keys():
                    vv = [v[j][kk] for j in range(len(v))]
                    category_metric[kk] = round(sum(vv) / len(vv), 2)
                category_metrics[k] = category_metric
            else:
                category_metrics[k] = round(sum(v) / len(v), 2)

        # compute average score
        if isinstance(next(iter(metrics.values())), dict):
            avg = defaultdict(list)
            for k, v in metrics.items():
                for kk, vv in v.items():
                    avg[kk].append(vv)
            for k, v in avg.items():
                avg[k] = round(sum(v) / len(v), 2)
        else:
            avg = round(sum(metrics.values()) / len(metrics), 2)
        metrics["avg"] = avg

        file_logger = FileLogger(makedirs(log_path))
        file_logger.log(metrics, Args=asdict(args), Category_Metrics=category_metrics)


if __name__ == "__main__":
    main()
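A note on truncate_from_middle in eval_longbench.py: when a prompt exceeds the token budget, the script keeps the first and last max_length/2 tokens and drops the middle, the usual rationale being that the task instruction sits at the start of a LongBench prompt and the question at the end. Below is a standalone sketch of the same idea; the gpt2 tokenizer is used here only because it downloads without authentication, not because the script uses it:

# Sketch of middle truncation: keep the head and tail, drop the middle.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

def truncate_middle(prompt: str, max_length: int) -> str:
    ids = tokenizer.encode(prompt)
    if len(ids) <= max_length:
        return prompt
    half = max_length // 2
    # instruction at the head and question at the tail both survive
    return (tokenizer.decode(ids[:half], skip_special_tokens=True)
            + tokenizer.decode(ids[-half:], skip_special_tokens=True))

long_prompt = "Answer based on the context. " + "filler " * 500 + "Question: who wrote it?"
print(truncate_middle(long_prompt, max_length=64))  # head + tail, middle filler gone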
Long_LLM/activation_beacon/old/main/eval_longeval.py (new file, mode 100644)
# modified based on https://github.com/DachengLi1/LongChat/blob/longeval/longeval/eval.py
import os
import torch
import datasets
from tqdm import tqdm
from typing import List
from accelerate import Accelerator
from transformers import HfArgumentParser
from transformers.utils import logging
from torch.utils.data import DataLoader
from dataclasses import dataclass, field, asdict

from src import ModelArgs, DatasetProcessFn, DefaultDataCollator, FileLogger, get_model_and_tokenizer, makedirs, split_file_dir_name_ext
from .longbench_utils import qa_f1_score

logger = logging.get_logger(__name__)


@dataclass
class Args(ModelArgs):
    eval_data: str = field(
        default="activation-beacon:longeval/topic_retrieval.json",
        metadata={'help': 'Evaluation json data.'}
    )
    output_dir: str = field(
        default="data/results/longeval/",
        metadata={'help': 'Output directory for results and logs.'}
    )
    topic_num: List[int] = field(
        default_factory=lambda: [5, 10, 15, 20, 25],
        metadata={'help': 'How many topics in the conversation?'}
    )
    line_num: List[int] = field(
        default_factory=lambda: [200, 300, 400, 500, 600, 680],
        metadata={'help': 'How many lines in the conversation?'}
    )


def process_longeval(tokenizer, data_type, topic_num, line_num):
    @DatasetProcessFn()
    def _process(prompt, target, num_topics=None, num_lines=None, **kwds):
        # filter out samples that do not have proper number of topics/lines
        if data_type == "topic" and num_topics not in topic_num:
            return
        elif data_type == "line" and num_lines not in line_num:
            return
        else:
            inputs = tokenizer(prompt)
            inputs["target"] = target
            inputs["input_length"] = len(inputs.input_ids)
            if num_topics is not None:
                inputs["num"] = num_topics
            elif num_lines is not None:
                inputs["num"] = num_lines
            else:
                raise ValueError("Either num_topics or num_lines must appear in the dataset field!")
            return inputs
    return _process


@torch.no_grad()
def main():
    parser = HfArgumentParser([Args])
    args: Args = parser.parse_args_into_dataclasses()[0]

    # topic or line
    data_type = split_file_dir_name_ext(args.eval_data)[1].split("_")[0]

    accelerator = Accelerator(cpu=args.cpu)
    model, tokenizer = get_model_and_tokenizer(args, accelerator=accelerator)

    with accelerator.main_process_first():
        process_fn = process_longeval(tokenizer, data_type=data_type, topic_num=args.topic_num, line_num=args.line_num)
        raw_dataset = datasets.load_dataset("json", data_files=args.eval_data, cache_dir=args.dataset_cache_dir, split="train")
        dataset = raw_dataset.map(process_fn, batched=True, num_proc=32, remove_columns=raw_dataset.column_names)

    groupby_dataset = dataset.to_pandas().groupby("num")
    data_collator = DefaultDataCollator(tokenizer=tokenizer)

    metrics = {}
    for num, dataset in groupby_dataset:
        dataset = datasets.Dataset.from_pandas(groupby_dataset.get_group(num), preserve_index=False)
        all_targets = dataset["target"]
        # remove unnecessary columns
        dataset = dataset.remove_columns(["target", "num"])

        dataloader = DataLoader(
            dataset,
            batch_size=args.batch_size,
            collate_fn=data_collator,
            # only pin memory when no gpu
            pin_memory=not args.cpu,
        )
        # shard dataloader
        dataloader = accelerator.prepare(dataloader)

        all_lengths = []
        all_outputs = []
        for i, x in enumerate(tqdm(dataloader, desc=f"Evaluating {num} {data_type}")):
            # NOTE: important to reset memory for every batch
            if hasattr(model, "memory") and model.memory is not None:
                model.memory.reset()

            input_length = x.pop("input_length")

            outputs = model.generate(
                **x,
                max_new_tokens=50,
                do_sample=False,
                num_beams=1,
                temperature=1.0,
                top_p=1.0,
            )
            start_idx = x["input_ids"].shape[1]
            outputs = outputs[:, start_idx:]
            # must be contiguous
            outputs = outputs.contiguous()
            outputs = accelerator.pad_across_processes(outputs, pad_index=tokenizer.pad_token_id, dim=1)
            outputs = accelerator.gather_for_metrics(outputs).tolist()
            input_length = accelerator.gather_for_metrics(input_length).tolist()

            outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            all_outputs.extend(outputs)
            all_lengths.extend(input_length)

            # if accelerator.process_index == 0:
            #     print(f"Target:{all_targets[i]}\nPred:{repr(all_outputs[i])}")

        accuracy = 0
        f1 = 0
        for output, target in zip(all_outputs, all_targets):
            if target.lower() in output.lower():
                accuracy += 1
            f1 += round(qa_f1_score(output, target), 4)

        accuracy /= len(all_outputs)
        f1 /= len(all_outputs)
        length = int(sum(all_lengths) / len(all_lengths))

        metrics[length] = {
            "accuracy": accuracy,
            "f1": f1,
        }
        # if accelerator.process_index == 0:
        #     print(f"Accuracy of {num} {data_type}s (average length {length}): {accuracy}")

    if accelerator.process_index == 0:
        log_path = os.path.join(args.output_dir, f"{data_type}_retrieval.log")
        file_logger = FileLogger(makedirs(log_path))
        file_logger.log(metrics, Args=asdict(args))


if __name__ == "__main__":
    main()
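eval_longeval.py scores retrieval two ways: substring accuracy (the target appears anywhere in the output, case-insensitively) and an averaged QA-style F1 from qa_f1_score. A toy sketch of both metrics follows; the token-overlap F1 here is a common stand-in and may normalize text differently from the repo's qa_f1_score:

# Sketch of the scoring loop on toy outputs.
from collections import Counter

def token_f1(pred: str, target: str) -> float:
    # token-overlap F1; the repo's qa_f1_score may normalize punctuation differently
    pred_toks, tgt_toks = pred.lower().split(), target.lower().split()
    overlap = sum((Counter(pred_toks) & Counter(tgt_toks)).values())
    if overlap == 0:
        return 0.0
    precision, recall = overlap / len(pred_toks), overlap / len(tgt_toks)
    return 2 * precision * recall / (precision + recall)

all_outputs = ["the discussed topic was space travel", "i cannot recall"]
all_targets = ["space travel", "cooking"]

accuracy = sum(t.lower() in o.lower() for o, t in zip(all_outputs, all_targets)) / len(all_outputs)
f1 = sum(token_f1(o, t) for o, t in zip(all_outputs, all_targets)) / len(all_outputs)
print(accuracy, f1)  # 0.5, 0.25 (F1 is 0.5 on the first pair, 0 on the second)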