Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
zhougaofeng
internlm2-math-7B
Commits
ac9cc51b
Commit
ac9cc51b
authored
Jun 11, 2024
by
zhougaofeng
Browse files
Upload New File
parent
9f97f576
Pipeline
#1103
canceled with stages
Changes
1
Pipelines
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
51 additions
and
0 deletions
+51
-0
finetune/scripts/length_cdf.py
finetune/scripts/length_cdf.py
+51
-0
No files found.
finetune/scripts/length_cdf.py
0 → 100644
View file @
ac9cc51b
# coding=utf-8
# Calculates the distribution of the input lengths in the dataset.
# Usage: python length_cdf.py --model_name_or_path path_to_model --dataset alpaca_en --template default
from
collections
import
defaultdict
import
fire
from
tqdm
import
tqdm
from
llamafactory.data
import
get_dataset
from
llamafactory.hparams
import
get_train_args
from
llamafactory.model
import
load_tokenizer
def length_cdf(
    model_name_or_path: str,
    dataset: str = "alpaca_en",
    dataset_dir: str = "data",
    template: str = "default",
    interval: int = 1000,
) -> None:
    """Print the cumulative distribution of input lengths in a dataset.

    The dataset is loaded through ``get_dataset`` with ``stage="sft"``, the
    length of every entry in its ``"input_ids"`` column is rounded down to a
    multiple of ``interval``, and one line is printed per non-empty bucket
    (in ascending order) with the cumulative sample count and percentage.

    Args:
        model_name_or_path: Forwarded to ``get_train_args``; used to load the
            tokenizer via ``load_tokenizer``.
        dataset: Dataset name forwarded to ``get_train_args``.
        dataset_dir: Dataset directory forwarded to ``get_train_args``.
        template: Template name forwarded to ``get_train_args``.
        interval: Bucket width used to group lengths. Must be positive.

    Raises:
        ValueError: If ``interval`` is not positive. Checked before anything
            is loaded, so a bad value fails immediately rather than after the
            tokenizer and dataset have been prepared.
    """
    if interval <= 0:
        raise ValueError("interval must be a positive integer, got {!r}.".format(interval))

    # Only the first three of the five returned argument objects are needed.
    model_args, data_args, training_args, _, _ = get_train_args(
        dict(
            stage="sft",
            model_name_or_path=model_name_or_path,
            dataset=dataset,
            dataset_dir=dataset_dir,
            template=template,
            # NOTE(review): presumably set very high so samples are not
            # truncated before their length is measured -- confirm.
            cutoff_len=1_000_000,
            # NOTE(review): looks like a placeholder required by the argument
            # parser; nothing in this function writes to it -- confirm.
            output_dir="dummy_dir",
            overwrite_cache=True,
        )
    )
    tokenizer_module = load_tokenizer(model_args)
    trainset = get_dataset(model_args, data_args, training_args, stage="sft", **tokenizer_module)
    total_num = len(trainset)

    # Map each bucket's lower bound (a multiple of ``interval``) to its count.
    length_dict = defaultdict(int)
    for sample in tqdm(trainset["input_ids"]):
        length_dict[len(sample) // interval * interval] += 1

    count_accu = 0
    for length, count in sorted(length_dict.items()):
        count_accu += count
        # Derive the percentage from the integer running total instead of
        # summing per-bucket float ratios, so rounding error cannot build up.
        prob_accu = count_accu / total_num * 100
        print("{:d} ({:.2f}%) samples have length < {}.".format(count_accu, prob_accu, length + interval))
if __name__ == "__main__":
    # Hand the function to Fire so it is exposed as a command-line interface
    # when this file is run as a script.
    fire.Fire(length_cdf)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment