chenpangpang / transformers / Commits / 5e8c8eb5
"docs/source/en/serialization.mdx" did not exist on "207594be81b8e5a8589c8b11c3b236924555d806"
Unverified commit 5e8c8eb5, authored Feb 22, 2023 by Aaron Gokaslan, committed by GitHub on Feb 22, 2023

Apply ruff flake8-comprehensions (#21694)

Parent: df06fb1f
Changes (230 files in the full commit): showing 20 changed files on this page, with 42 additions and 42 deletions (+42 -42).
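The rule family applied in this commit is ruff's implementation of flake8-comprehensions (the C4xx checks), which flags collection constructors that can be written as literals or comprehensions; every hunk on this page is an instance of one of a few recurring patterns. The following before/after sketch uses illustrative variable names (not taken from any single file in the diff) to show the rewrites involved:

    # C408: dict() with no arguments becomes a literal
    data_files = dict()                                      # before
    data_files = {}                                          # after

    # C402: dict() over a generator of key/value pairs becomes a dict comprehension
    sizes = dict((name, p.numel()) for name, p in params)    # before
    sizes = {name: p.numel() for name, p in params}          # after

    # C403: set() wrapped around a list comprehension becomes a set comprehension
    tokens = set([t for t in text.split() if t])             # before
    tokens = {t for t in text.split() if t}                  # after

    # C413/C414: a redundant list() around or inside sorted() is dropped
    ordered = list(sorted(vocab))                            # before
    ordered = sorted(vocab)                                  # after

These checks are typically enabled by adding "C4" to the select list in the project's ruff configuration; that configuration change, if any, is not shown on this page of the diff.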
examples/pytorch/contrastive-image-text/run_clip.py  (+1 -1)
examples/pytorch/image-classification/run_image_classification.py  (+1 -1)
examples/pytorch/image-pretraining/run_mae.py  (+1 -1)
examples/pytorch/image-pretraining/run_mim.py  (+1 -1)
examples/pytorch/language-modeling/run_clm.py  (+1 -1)
examples/pytorch/multiple-choice/run_swag.py  (+8 -8)
examples/pytorch/semantic-segmentation/run_semantic_segmentation.py  (+2 -2)
examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py  (+2 -2)
examples/pytorch/speech-recognition/run_speech_recognition_ctc.py  (+1 -1)
examples/pytorch/text-classification/run_glue.py  (+2 -2)
examples/pytorch/text-classification/run_glue_no_trainer.py  (+2 -2)
examples/pytorch/token-classification/run_ner.py  (+3 -3)
examples/pytorch/token-classification/run_ner_no_trainer.py  (+3 -3)
examples/research_projects/bert-loses-patience/run_glue_with_pabee.py  (+3 -3)
examples/research_projects/bertabs/modeling_bertabs.py  (+1 -1)
examples/research_projects/bertology/run_bertology.py  (+3 -3)
examples/research_projects/bertology/run_prune_gpt.py  (+3 -3)
examples/research_projects/codeparrot/scripts/minhash_deduplication.py  (+2 -2)
examples/research_projects/codeparrot/scripts/preprocessing.py  (+1 -1)
examples/research_projects/codeparrot/scripts/pretokenizing.py  (+1 -1)
examples/pytorch/contrastive-image-text/run_clip.py

@@ -397,7 +397,7 @@ def main():
     # Preprocessing the datasets.
     # We need to tokenize input captions and transform the images.
     def tokenize_captions(examples):
-        captions = [caption for caption in examples[caption_column]]
+        captions = list(examples[caption_column])
         text_inputs = tokenizer(captions, max_length=data_args.max_seq_length, padding="max_length", truncation=True)
         examples["input_ids"] = text_inputs.input_ids
         examples["attention_mask"] = text_inputs.attention_mask
examples/pytorch/image-classification/run_image_classification.py

@@ -250,7 +250,7 @@ def main():
     # Prepare label mappings.
     # We'll include these in the model's config to get human readable labels in the Inference API.
     labels = dataset["train"].features["labels"].names
-    label2id, id2label = dict(), dict()
+    label2id, id2label = {}, {}
     for i, label in enumerate(labels):
         label2id[label] = str(i)
         id2label[str(i)] = label
examples/pytorch/image-pretraining/run_mae.py

@@ -91,7 +91,7 @@ class DataTrainingArguments:
     )

     def __post_init__(self):
-        data_files = dict()
+        data_files = {}
         if self.train_dir is not None:
             data_files["train"] = self.train_dir
         if self.validation_dir is not None:
examples/pytorch/image-pretraining/run_mim.py

@@ -104,7 +104,7 @@ class DataTrainingArguments:
     )

     def __post_init__(self):
-        data_files = dict()
+        data_files = {}
         if self.train_dir is not None:
             data_files["train"] = self.train_dir
         if self.validation_dir is not None:
examples/pytorch/language-modeling/run_clm.py

@@ -407,7 +407,7 @@ def main():
         )
     else:
         model = AutoModelForCausalLM.from_config(config)
-        n_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values())
+        n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
         logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")

     # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
examples/pytorch/multiple-choice/run_swag.py

@@ -457,14 +457,14 @@ def main():
         trainer.log_metrics("eval", metrics)
         trainer.save_metrics("eval", metrics)

-    kwargs = dict(
-        finetuned_from=model_args.model_name_or_path,
-        tasks="multiple-choice",
-        dataset_tags="swag",
-        dataset_args="regular",
-        dataset="SWAG",
-        language="en",
-    )
+    kwargs = {
+        "finetuned_from": model_args.model_name_or_path,
+        "tasks": "multiple-choice",
+        "dataset_tags": "swag",
+        "dataset_args": "regular",
+        "dataset": "SWAG",
+        "language": "en",
+    }

     if training_args.push_to_hub:
         trainer.push_to_hub(**kwargs)
examples/pytorch/semantic-segmentation/run_semantic_segmentation.py

@@ -430,7 +430,7 @@ def main():
             pixel_values.append(image)
             labels.append(target)

-        encoding = dict()
+        encoding = {}
         encoding["pixel_values"] = torch.stack(pixel_values)
         encoding["labels"] = torch.stack(labels)

@@ -444,7 +444,7 @@ def main():
             pixel_values.append(image)
             labels.append(target)

-        encoding = dict()
+        encoding = {}
         encoding["pixel_values"] = torch.stack(pixel_values)
         encoding["labels"] = torch.stack(labels)
examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py

@@ -441,7 +441,7 @@ def main():
             pixel_values.append(image)
             labels.append(target)

-        encoding = dict()
+        encoding = {}
         encoding["pixel_values"] = torch.stack(pixel_values)
         encoding["labels"] = torch.stack(labels)

@@ -455,7 +455,7 @@ def main():
             pixel_values.append(image)
             labels.append(target)

-        encoding = dict()
+        encoding = {}
         encoding["pixel_values"] = torch.stack(pixel_values)
         encoding["labels"] = torch.stack(labels)
examples/pytorch/speech-recognition/run_speech_recognition_ctc.py

@@ -349,7 +349,7 @@ def create_vocabulary_from_data(
         lambda vocab_1, vocab_2: set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]), vocabs.values()
     )

-    vocab_dict = {v: k for k, v in enumerate(sorted(list(vocab_set)))}
+    vocab_dict = {v: k for k, v in enumerate(sorted(vocab_set))}

     # replace white space with delimiter token
     if word_delimiter_token is not None:
examples/pytorch/text-classification/run_glue.py

@@ -406,12 +406,12 @@ def main():
     ):
         # Some have all caps in their config, some don't.
         label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()}
-        if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
+        if sorted(label_name_to_id.keys()) == sorted(label_list):
             label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)}
         else:
             logger.warning(
                 "Your model seems to have been trained with labels, but they don't match the dataset: ",
-                f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}."
+                f"model labels: {sorted(label_name_to_id.keys())}, dataset labels: {sorted(label_list)}."
                 "\nIgnoring the model labels as a result.",
             )
     elif data_args.task_name is None and not is_regression:
examples/pytorch/text-classification/run_glue_no_trainer.py

@@ -339,7 +339,7 @@ def main():
     ):
         # Some have all caps in their config, some don't.
         label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()}
-        if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
+        if sorted(label_name_to_id.keys()) == sorted(label_list):
             logger.info(
                 f"The configuration of the model provided the following label correspondence: {label_name_to_id}. "
                 "Using it!"

@@ -348,7 +348,7 @@ def main():
         else:
             logger.warning(
                 "Your model seems to have been trained with labels, but they don't match the dataset: ",
-                f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}."
+                f"model labels: {sorted(label_name_to_id.keys())}, dataset labels: {sorted(label_list)}."
                 "\nIgnoring the model labels as a result.",
             )
     elif args.task_name is None and not is_regression:
examples/pytorch/token-classification/run_ner.py

@@ -386,7 +386,7 @@ def main():
     # Model has labels -> use them.
     if model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id:
-        if list(sorted(model.config.label2id.keys())) == list(sorted(label_list)):
+        if sorted(model.config.label2id.keys()) == sorted(label_list):
             # Reorganize `label_list` to match the ordering of the model.
             if labels_are_int:
                 label_to_id = {i: int(model.config.label2id[l]) for i, l in enumerate(label_list)}

@@ -397,8 +397,8 @@ def main():
         else:
             logger.warning(
                 "Your model seems to have been trained with labels, but they don't match the dataset: ",
-                f"model labels: {list(sorted(model.config.label2id.keys()))}, dataset labels:"
-                f" {list(sorted(label_list))}.\nIgnoring the model labels as a result.",
+                f"model labels: {sorted(model.config.label2id.keys())}, dataset labels:"
+                f" {sorted(label_list)}.\nIgnoring the model labels as a result.",
             )

     # Set the correspondences label/ID inside the model config
examples/pytorch/token-classification/run_ner_no_trainer.py

@@ -425,7 +425,7 @@ def main():
     # Model has labels -> use them.
     if model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id:
-        if list(sorted(model.config.label2id.keys())) == list(sorted(label_list)):
+        if sorted(model.config.label2id.keys()) == sorted(label_list):
             # Reorganize `label_list` to match the ordering of the model.
             if labels_are_int:
                 label_to_id = {i: int(model.config.label2id[l]) for i, l in enumerate(label_list)}

@@ -436,8 +436,8 @@ def main():
         else:
             logger.warning(
                 "Your model seems to have been trained with labels, but they don't match the dataset: ",
-                f"model labels: {list(sorted(model.config.label2id.keys()))}, dataset labels:"
-                f" {list(sorted(label_list))}.\nIgnoring the model labels as a result.",
+                f"model labels: {sorted(model.config.label2id.keys())}, dataset labels:"
+                f" {sorted(label_list)}.\nIgnoring the model labels as a result.",
             )

     # Set the correspondences label/ID inside the model config
examples/research_projects/bert-loses-patience/run_glue_with_pabee.py

@@ -727,9 +727,9 @@ def main():
         tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)

         checkpoints = [args.output_dir]
         if args.eval_all_checkpoints:
-            checkpoints = list(
+            checkpoints = [
                 os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
-            )
+            ]

         logger.info("Evaluate the following checkpoints: %s", checkpoints)

@@ -743,7 +743,7 @@ def main():
             print(f"Evaluation for checkpoint {prefix}")
             for patience in patience_list:
                 result = evaluate(args, model, tokenizer, prefix=prefix, patience=patience)
-                result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
+                result = {k + "_{}".format(global_step): v for k, v in result.items()}
                 results.update(result)

     return results
examples/research_projects/bertabs/modeling_bertabs.py

@@ -54,7 +54,7 @@ class BertAbs(BertAbsPreTrainedModel):
         load_bert_pretrained_extractive = True if bert_extractive_checkpoint else False
         if load_bert_pretrained_extractive:
             self.bert.model.load_state_dict(
-                dict([(n[11:], p) for n, p in bert_extractive_checkpoint.items() if n.startswith("bert.model")]),
+                {n[11:]: p for n, p in bert_extractive_checkpoint.items() if n.startswith("bert.model")},
                 strict=True,
             )
examples/research_projects/bertology/run_bertology.py

@@ -218,9 +218,9 @@ def prune_heads(args, model, eval_dataloader, head_mask):
     original_time = datetime.now() - before_time

     original_num_params = sum(p.numel() for p in model.parameters())
-    heads_to_prune = dict(
-        (layer, (1 - head_mask[layer].long()).nonzero().squeeze().tolist()) for layer in range(len(head_mask))
-    )
+    heads_to_prune = {
+        layer: (1 - head_mask[layer].long()).nonzero().squeeze().tolist() for layer in range(len(head_mask))
+    }

     assert sum(len(h) for h in heads_to_prune.values()) == (1 - head_mask.long()).sum().item()
     model.prune_heads(heads_to_prune)
examples/research_projects/bertology/run_prune_gpt.py

@@ -194,9 +194,9 @@ def prune_heads(args, model, eval_dataloader, head_mask):
     original_time = datetime.now() - before_time

     original_num_params = sum(p.numel() for p in model.parameters())
-    heads_to_prune = dict(
-        (layer, (1 - head_mask[layer].long()).nonzero().squeeze().tolist()) for layer in range(len(head_mask))
-    )
+    heads_to_prune = {
+        layer: (1 - head_mask[layer].long()).nonzero().squeeze().tolist() for layer in range(len(head_mask))
+    }

     for k, v in heads_to_prune.items():
         if isinstance(v, int):
examples/research_projects/codeparrot/scripts/minhash_deduplication.py

@@ -29,7 +29,7 @@ def get_min_hash(tokens: List[str]) -> Optional[MinHash]:
 def get_tokens(code: str) -> Set[str]:
     """Tokenize a code snippet."""
-    return set([t for t in NON_ALPHA.split(code) if len(t.strip()) > 0])
+    return {t for t in NON_ALPHA.split(code) if len(t.strip()) > 0}


 class DuplicationIndex:

@@ -243,7 +243,7 @@ def deduplicate_dataset(
         >>> ds_dedup, duplicate_clusters = deduplicate_dataset(ds, jaccard_threshold=0.85)
     """
     duplicate_clusters = make_duplicate_clusters(dataset, jaccard_threshold)
-    duplicate_indices = set(x["base_index"] for cluster in duplicate_clusters for x in cluster)
+    duplicate_indices = {x["base_index"] for cluster in duplicate_clusters for x in cluster}
     extreme_dict = {}
     extremes_clusters = find_extremes(duplicate_clusters, dataset, jaccard_threshold)
     for extremes in extremes_clusters:
examples/research_projects/codeparrot/scripts/preprocessing.py

@@ -114,7 +114,7 @@ def char_token_ratio(example):
 def preprocess(example):
     """Chain all preprocessing steps into one function to not fill cache."""
-    results = dict()
+    results = {}
     results.update(get_hash(example))
     results.update(line_stats(example))
     results.update(alpha_stats(example))
examples/research_projects/codeparrot/scripts/pretokenizing.py

@@ -8,7 +8,7 @@ from transformers import AutoTokenizer, HfArgumentParser
 def tokenize(example):
-    output = dict()
+    output = {}
     output["input_ids"] = tokenizer(example["content"], truncation=False)["input_ids"]
     output["ratio_char_token"] = len(example["content"]) / len(output["input_ids"])
     return output