chenych / llama-recipes · Commit 5eaaba41
Authored May 24, 2024 by Rayyyyy

First add in 0524

Pipeline #1017 failed (0 seconds)
Showing 2 changed files with 208 additions and 0 deletions:

  src/tests/test_sampler.py      +87   -0
  src/tests/test_train_utils.py  +121  -0
src/tests/test_sampler.py (new file, 0 → 100644) @ 5eaaba41
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.

import random

import pytest
import torch

from llama_recipes.data.sampler import LengthBasedBatchSampler
from llama_recipes.data.sampler import DistributedLengthBasedBatchSampler

SAMPLES = 33


@pytest.fixture
def dataset():
    random.seed(42)
    dataset = []

    def add_samples(ds, n, a, b):
        for _ in range(n):
            ds.append(random.randint(a, b) * [1,])

    add_samples(dataset, SAMPLES // 2, 1, 9)
    add_samples(dataset, (SAMPLES // 2) + (SAMPLES % 2), 10, 20)

    return random.sample(dataset, len(dataset))


@pytest.mark.parametrize("batch_size, drop_last", [(2, False), (8, False), (2, True), (8, True)])
def test_batch_sampler_array(dataset, batch_size, drop_last):

    sampler = LengthBasedBatchSampler(dataset, batch_size, drop_last)

    EXPECTED_LENGTH = SAMPLES // batch_size if drop_last else (SAMPLES // batch_size) + (SAMPLES % batch_size)

    all_ids = [i for b in sampler for i in b]
    assert len(set(all_ids)) == EXPECTED_LENGTH * batch_size if drop_last else len(dataset)

    assert len(sampler) == EXPECTED_LENGTH

    is_long = [len(d) >= 10 for d in dataset]

    def check_batch(batch):
        return all(batch) or not any(batch)

    assert all(check_batch(is_long[i] for i in b) for b in sampler)


@pytest.mark.parametrize("batch_size, drop_last", [(2, False), (8, False), (2, True), (8, True)])
def test_batch_sampler_dict(dataset, batch_size, drop_last):

    dist_dataset = [{"input_ids": d, "attention_mask": d} for d in dataset]

    sampler = LengthBasedBatchSampler(dist_dataset, batch_size, drop_last)

    EXPECTED_LENGTH = SAMPLES // batch_size if drop_last else (SAMPLES // batch_size) + (SAMPLES % batch_size)

    assert len(sampler) == EXPECTED_LENGTH

    is_long = [len(d) >= 10 for d in dataset]

    def check_batch(batch):
        return all(batch) or not any(batch)

    assert all(check_batch(is_long[i] for i in b) for b in sampler)


@pytest.mark.parametrize("batch_size", [2, 8])
def test_dist_batch_sampling(dataset, batch_size):
    sampler_1 = DistributedLengthBasedBatchSampler(
        dataset,
        batch_size=batch_size,
        rank=0,
        num_replicas=2,
        shuffle=False,
    )
    sampler_2 = DistributedLengthBasedBatchSampler(
        dataset,
        batch_size=batch_size,
        rank=1,
        num_replicas=2,
        shuffle=False,
    )

    ids_1 = set(i for b in sampler_1 for i in b)
    ids_2 = set(i for b in sampler_2 for i in b)

    assert ids_1.isdisjoint(ids_2)
    assert len(ids_1) + len(ids_2) > 0
    assert len(ids_1) + len(ids_2) == len(dataset) // batch_size * batch_size
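The tests above treat LengthBasedBatchSampler as a batch sampler that yields lists of indices in which every sample is either "short" or "long" relative to the length-10 threshold. A minimal usage sketch follows, assuming only the positional (dataset, batch_size, drop_last) constructor seen in the tests and the standard torch DataLoader batch_sampler hook; nothing else about the sampler's API is taken from this commit.

# Sketch only: wiring LengthBasedBatchSampler into a DataLoader via batch_sampler.
# The constructor call mirrors the positional usage in the tests above; any other
# behavior of the sampler is an assumption, not something this commit shows.
from torch.utils.data import DataLoader
from llama_recipes.data.sampler import LengthBasedBatchSampler

def make_length_grouped_loader(dataset, batch_size=8, drop_last=True, collate_fn=None):
    # The sampler yields index lists whose samples have similar lengths,
    # which is what the check_batch assertions in the tests verify.
    sampler = LengthBasedBatchSampler(dataset, batch_size, drop_last)
    return DataLoader(dataset, batch_sampler=sampler, collate_fn=collate_fn)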
src/tests/test_train_utils.py (new file, 0 → 100644) @ 5eaaba41
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.

from unittest.mock import patch

import pytest
import torch
import os
import shutil

from llama_recipes.utils.train_utils import train

TEMP_OUTPUT_DIR = os.getcwd() + "/tmp"


@pytest.fixture(scope="session")
def temp_output_dir():
    # Create the directory during the session-level setup
    temp_output_dir = "tmp"
    os.mkdir(os.path.join(os.getcwd(), temp_output_dir))
    yield temp_output_dir
    # Delete the directory during the session-level teardown
    shutil.rmtree(temp_output_dir)


@patch("llama_recipes.utils.train_utils.MemoryTrace")
@patch("llama_recipes.utils.train_utils.nullcontext")
@patch("llama_recipes.utils.train_utils.torch.cuda.amp.GradScaler")
@patch("llama_recipes.utils.train_utils.torch.cuda.amp.autocast")
def test_gradient_accumulation(autocast, scaler, nullcontext, mem_trace, mocker):

    model = mocker.MagicMock(name="model")
    model().loss.__truediv__().detach.return_value = torch.tensor(1)
    mock_tensor = mocker.MagicMock(name="tensor")
    batch = {"input": mock_tensor}
    train_dataloader = [batch, batch, batch, batch, batch]
    eval_dataloader = None
    tokenizer = mocker.MagicMock()
    optimizer = mocker.MagicMock()
    lr_scheduler = mocker.MagicMock()
    gradient_accumulation_steps = 1
    train_config = mocker.MagicMock()
    train_config.enable_fsdp = False
    train_config.use_fp16 = False
    train_config.run_validation = False
    train_config.gradient_clipping = False
    train_config.max_train_step = 0
    train_config.max_eval_step = 0
    train_config.save_metrics = False

    train(
        model,
        train_dataloader,
        eval_dataloader,
        tokenizer,
        optimizer,
        lr_scheduler,
        gradient_accumulation_steps,
        train_config,
    )

    assert optimizer.zero_grad.call_count == 5
    optimizer.zero_grad.reset_mock()

    assert nullcontext.call_count == 5
    nullcontext.reset_mock()

    assert autocast.call_count == 0

    gradient_accumulation_steps = 2
    train_config.use_fp16 = True
    train(
        model,
        train_dataloader,
        eval_dataloader,
        tokenizer,
        optimizer,
        lr_scheduler,
        gradient_accumulation_steps,
        train_config,
    )
    assert optimizer.zero_grad.call_count == 3
    assert nullcontext.call_count == 0
    assert autocast.call_count == 5


def test_save_to_json(temp_output_dir, mocker):
    model = mocker.MagicMock(name="model")
    model().loss.__truediv__().detach.return_value = torch.tensor(1)
    mock_tensor = mocker.MagicMock(name="tensor")
    batch = {"input": mock_tensor}
    train_dataloader = [batch, batch, batch, batch, batch]
    eval_dataloader = None
    tokenizer = mocker.MagicMock()
    optimizer = mocker.MagicMock()
    lr_scheduler = mocker.MagicMock()
    gradient_accumulation_steps = 1
    train_config = mocker.MagicMock()
    train_config.enable_fsdp = False
    train_config.use_fp16 = False
    train_config.run_validation = False
    train_config.gradient_clipping = False
    train_config.save_metrics = True
    train_config.max_train_step = 0
    train_config.max_eval_step = 0
    train_config.output_dir = temp_output_dir
    train_config.use_profiler = False

    results = train(
        model,
        train_dataloader,
        eval_dataloader,
        tokenizer,
        optimizer,
        lr_scheduler,
        gradient_accumulation_steps,
        train_config,
        local_rank=0
    )

    assert results["metrics_filename"] not in ["", None]
    assert os.path.isfile(results["metrics_filename"])
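Both tests drive train() with a MagicMock train_config, so the only contract they exercise is the set of attributes they assign before the call. As a rough illustration, a plain object exposing those same fields would satisfy the same code paths; the class name and defaults below are assumptions for illustration, not part of llama_recipes or this commit.

# Illustration only: a hypothetical stand-in for the mocked train_config,
# listing exactly the attributes the tests above set. Field names come from
# the tests; the dataclass itself and its default values are assumptions.
from dataclasses import dataclass

@dataclass
class MinimalTrainConfig:
    enable_fsdp: bool = False
    use_fp16: bool = False
    run_validation: bool = False
    gradient_clipping: bool = False
    save_metrics: bool = False
    max_train_step: int = 0
    max_eval_step: int = 0
    output_dir: str = "tmp"      # test_save_to_json points this at temp_output_dir
    use_profiler: bool = False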