Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
5e8c8eb5
Unverified
Commit
5e8c8eb5
authored
Feb 22, 2023
by
Aaron Gokaslan
Committed by
GitHub
Feb 22, 2023
Browse files
Apply ruff flake8-comprehensions (#21694)
parent
df06fb1f
Changes
230
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
42 additions
and
42 deletions
+42
-42
examples/pytorch/contrastive-image-text/run_clip.py
examples/pytorch/contrastive-image-text/run_clip.py
+1
-1
examples/pytorch/image-classification/run_image_classification.py
.../pytorch/image-classification/run_image_classification.py
+1
-1
examples/pytorch/image-pretraining/run_mae.py
examples/pytorch/image-pretraining/run_mae.py
+1
-1
examples/pytorch/image-pretraining/run_mim.py
examples/pytorch/image-pretraining/run_mim.py
+1
-1
examples/pytorch/language-modeling/run_clm.py
examples/pytorch/language-modeling/run_clm.py
+1
-1
examples/pytorch/multiple-choice/run_swag.py
examples/pytorch/multiple-choice/run_swag.py
+8
-8
examples/pytorch/semantic-segmentation/run_semantic_segmentation.py
...ytorch/semantic-segmentation/run_semantic_segmentation.py
+2
-2
examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
...ntic-segmentation/run_semantic_segmentation_no_trainer.py
+2
-2
examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
.../pytorch/speech-recognition/run_speech_recognition_ctc.py
+1
-1
examples/pytorch/text-classification/run_glue.py
examples/pytorch/text-classification/run_glue.py
+2
-2
examples/pytorch/text-classification/run_glue_no_trainer.py
examples/pytorch/text-classification/run_glue_no_trainer.py
+2
-2
examples/pytorch/token-classification/run_ner.py
examples/pytorch/token-classification/run_ner.py
+3
-3
examples/pytorch/token-classification/run_ner_no_trainer.py
examples/pytorch/token-classification/run_ner_no_trainer.py
+3
-3
examples/research_projects/bert-loses-patience/run_glue_with_pabee.py
...earch_projects/bert-loses-patience/run_glue_with_pabee.py
+3
-3
examples/research_projects/bertabs/modeling_bertabs.py
examples/research_projects/bertabs/modeling_bertabs.py
+1
-1
examples/research_projects/bertology/run_bertology.py
examples/research_projects/bertology/run_bertology.py
+3
-3
examples/research_projects/bertology/run_prune_gpt.py
examples/research_projects/bertology/run_prune_gpt.py
+3
-3
examples/research_projects/codeparrot/scripts/minhash_deduplication.py
...arch_projects/codeparrot/scripts/minhash_deduplication.py
+2
-2
examples/research_projects/codeparrot/scripts/preprocessing.py
...les/research_projects/codeparrot/scripts/preprocessing.py
+1
-1
examples/research_projects/codeparrot/scripts/pretokenizing.py
...les/research_projects/codeparrot/scripts/pretokenizing.py
+1
-1
No files found.
examples/pytorch/contrastive-image-text/run_clip.py
View file @
5e8c8eb5
...
...
@@ -397,7 +397,7 @@ def main():
# Preprocessing the datasets.
# We need to tokenize input captions and transform the images.
def
tokenize_captions
(
examples
):
captions
=
[
caption
for
caption
in
examples
[
caption_column
]
]
captions
=
list
(
examples
[
caption_column
]
)
text_inputs
=
tokenizer
(
captions
,
max_length
=
data_args
.
max_seq_length
,
padding
=
"max_length"
,
truncation
=
True
)
examples
[
"input_ids"
]
=
text_inputs
.
input_ids
examples
[
"attention_mask"
]
=
text_inputs
.
attention_mask
...
...
examples/pytorch/image-classification/run_image_classification.py
View file @
5e8c8eb5
...
...
@@ -250,7 +250,7 @@ def main():
# Prepare label mappings.
# We'll include these in the model's config to get human readable labels in the Inference API.
labels
=
dataset
[
"train"
].
features
[
"labels"
].
names
label2id
,
id2label
=
dict
(),
dict
()
label2id
,
id2label
=
{},
{}
for
i
,
label
in
enumerate
(
labels
):
label2id
[
label
]
=
str
(
i
)
id2label
[
str
(
i
)]
=
label
...
...
examples/pytorch/image-pretraining/run_mae.py
View file @
5e8c8eb5
...
...
@@ -91,7 +91,7 @@ class DataTrainingArguments:
)
def
__post_init__
(
self
):
data_files
=
dict
()
data_files
=
{}
if
self
.
train_dir
is
not
None
:
data_files
[
"train"
]
=
self
.
train_dir
if
self
.
validation_dir
is
not
None
:
...
...
examples/pytorch/image-pretraining/run_mim.py
View file @
5e8c8eb5
...
...
@@ -104,7 +104,7 @@ class DataTrainingArguments:
)
def
__post_init__
(
self
):
data_files
=
dict
()
data_files
=
{}
if
self
.
train_dir
is
not
None
:
data_files
[
"train"
]
=
self
.
train_dir
if
self
.
validation_dir
is
not
None
:
...
...
examples/pytorch/language-modeling/run_clm.py
View file @
5e8c8eb5
...
...
@@ -407,7 +407,7 @@ def main():
)
else
:
model
=
AutoModelForCausalLM
.
from_config
(
config
)
n_params
=
sum
(
dict
((
p
.
data_ptr
()
,
p
.
numel
()
)
for
p
in
model
.
parameters
()
)
.
values
())
n_params
=
sum
(
{
p
.
data_ptr
()
:
p
.
numel
()
for
p
in
model
.
parameters
()
}
.
values
())
logger
.
info
(
f
"Training new model from scratch - Total size=
{
n_params
/
2
**
20
:.
2
f
}
M params"
)
# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
...
...
examples/pytorch/multiple-choice/run_swag.py
View file @
5e8c8eb5
...
...
@@ -457,14 +457,14 @@ def main():
trainer
.
log_metrics
(
"eval"
,
metrics
)
trainer
.
save_metrics
(
"eval"
,
metrics
)
kwargs
=
dict
(
finetuned_from
=
model_args
.
model_name_or_path
,
tasks
=
"multiple-choice"
,
dataset_tags
=
"swag"
,
dataset_args
=
"regular"
,
dataset
=
"SWAG"
,
language
=
"en"
,
)
kwargs
=
{
"
finetuned_from
"
:
model_args
.
model_name_or_path
,
"
tasks
"
:
"multiple-choice"
,
"
dataset_tags
"
:
"swag"
,
"
dataset_args
"
:
"regular"
,
"
dataset
"
:
"SWAG"
,
"
language
"
:
"en"
,
}
if
training_args
.
push_to_hub
:
trainer
.
push_to_hub
(
**
kwargs
)
...
...
examples/pytorch/semantic-segmentation/run_semantic_segmentation.py
View file @
5e8c8eb5
...
...
@@ -430,7 +430,7 @@ def main():
pixel_values
.
append
(
image
)
labels
.
append
(
target
)
encoding
=
dict
()
encoding
=
{}
encoding
[
"pixel_values"
]
=
torch
.
stack
(
pixel_values
)
encoding
[
"labels"
]
=
torch
.
stack
(
labels
)
...
...
@@ -444,7 +444,7 @@ def main():
pixel_values
.
append
(
image
)
labels
.
append
(
target
)
encoding
=
dict
()
encoding
=
{}
encoding
[
"pixel_values"
]
=
torch
.
stack
(
pixel_values
)
encoding
[
"labels"
]
=
torch
.
stack
(
labels
)
...
...
examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
View file @
5e8c8eb5
...
...
@@ -441,7 +441,7 @@ def main():
pixel_values
.
append
(
image
)
labels
.
append
(
target
)
encoding
=
dict
()
encoding
=
{}
encoding
[
"pixel_values"
]
=
torch
.
stack
(
pixel_values
)
encoding
[
"labels"
]
=
torch
.
stack
(
labels
)
...
...
@@ -455,7 +455,7 @@ def main():
pixel_values
.
append
(
image
)
labels
.
append
(
target
)
encoding
=
dict
()
encoding
=
{}
encoding
[
"pixel_values"
]
=
torch
.
stack
(
pixel_values
)
encoding
[
"labels"
]
=
torch
.
stack
(
labels
)
...
...
examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
View file @
5e8c8eb5
...
...
@@ -349,7 +349,7 @@ def create_vocabulary_from_data(
lambda
vocab_1
,
vocab_2
:
set
(
vocab_1
[
"vocab"
][
0
])
|
set
(
vocab_2
[
"vocab"
][
0
]),
vocabs
.
values
()
)
vocab_dict
=
{
v
:
k
for
k
,
v
in
enumerate
(
sorted
(
list
(
vocab_set
))
)
}
vocab_dict
=
{
v
:
k
for
k
,
v
in
enumerate
(
sorted
(
vocab_set
))}
# replace white space with delimiter token
if
word_delimiter_token
is
not
None
:
...
...
examples/pytorch/text-classification/run_glue.py
View file @
5e8c8eb5
...
...
@@ -406,12 +406,12 @@ def main():
):
# Some have all caps in their config, some don't.
label_name_to_id
=
{
k
.
lower
():
v
for
k
,
v
in
model
.
config
.
label2id
.
items
()}
if
list
(
sorted
(
label_name_to_id
.
keys
())
)
==
list
(
sorted
(
label_list
)
)
:
if
sorted
(
label_name_to_id
.
keys
())
==
sorted
(
label_list
):
label_to_id
=
{
i
:
int
(
label_name_to_id
[
label_list
[
i
]])
for
i
in
range
(
num_labels
)}
else
:
logger
.
warning
(
"Your model seems to have been trained with labels, but they don't match the dataset: "
,
f
"model labels:
{
list
(
sorted
(
label_name_to_id
.
keys
())
)
}
, dataset labels:
{
list
(
sorted
(
label_list
)
)
}
."
f
"model labels:
{
sorted
(
label_name_to_id
.
keys
())
}
, dataset labels:
{
sorted
(
label_list
)
}
."
"
\n
Ignoring the model labels as a result."
,
)
elif
data_args
.
task_name
is
None
and
not
is_regression
:
...
...
examples/pytorch/text-classification/run_glue_no_trainer.py
View file @
5e8c8eb5
...
...
@@ -339,7 +339,7 @@ def main():
):
# Some have all caps in their config, some don't.
label_name_to_id
=
{
k
.
lower
():
v
for
k
,
v
in
model
.
config
.
label2id
.
items
()}
if
list
(
sorted
(
label_name_to_id
.
keys
())
)
==
list
(
sorted
(
label_list
)
)
:
if
sorted
(
label_name_to_id
.
keys
())
==
sorted
(
label_list
):
logger
.
info
(
f
"The configuration of the model provided the following label correspondence:
{
label_name_to_id
}
. "
"Using it!"
...
...
@@ -348,7 +348,7 @@ def main():
else
:
logger
.
warning
(
"Your model seems to have been trained with labels, but they don't match the dataset: "
,
f
"model labels:
{
list
(
sorted
(
label_name_to_id
.
keys
())
)
}
, dataset labels:
{
list
(
sorted
(
label_list
)
)
}
."
f
"model labels:
{
sorted
(
label_name_to_id
.
keys
())
}
, dataset labels:
{
sorted
(
label_list
)
}
."
"
\n
Ignoring the model labels as a result."
,
)
elif
args
.
task_name
is
None
and
not
is_regression
:
...
...
examples/pytorch/token-classification/run_ner.py
View file @
5e8c8eb5
...
...
@@ -386,7 +386,7 @@ def main():
# Model has labels -> use them.
if
model
.
config
.
label2id
!=
PretrainedConfig
(
num_labels
=
num_labels
).
label2id
:
if
list
(
sorted
(
model
.
config
.
label2id
.
keys
())
)
==
list
(
sorted
(
label_list
)
)
:
if
sorted
(
model
.
config
.
label2id
.
keys
())
==
sorted
(
label_list
):
# Reorganize `label_list` to match the ordering of the model.
if
labels_are_int
:
label_to_id
=
{
i
:
int
(
model
.
config
.
label2id
[
l
])
for
i
,
l
in
enumerate
(
label_list
)}
...
...
@@ -397,8 +397,8 @@ def main():
else
:
logger
.
warning
(
"Your model seems to have been trained with labels, but they don't match the dataset: "
,
f
"model labels:
{
list
(
sorted
(
model
.
config
.
label2id
.
keys
())
)
}
, dataset labels:"
f
"
{
list
(
sorted
(
label_list
)
)
}
.
\n
Ignoring the model labels as a result."
,
f
"model labels:
{
sorted
(
model
.
config
.
label2id
.
keys
())
}
, dataset labels:"
f
"
{
sorted
(
label_list
)
}
.
\n
Ignoring the model labels as a result."
,
)
# Set the correspondences label/ID inside the model config
...
...
examples/pytorch/token-classification/run_ner_no_trainer.py
View file @
5e8c8eb5
...
...
@@ -425,7 +425,7 @@ def main():
# Model has labels -> use them.
if
model
.
config
.
label2id
!=
PretrainedConfig
(
num_labels
=
num_labels
).
label2id
:
if
list
(
sorted
(
model
.
config
.
label2id
.
keys
())
)
==
list
(
sorted
(
label_list
)
)
:
if
sorted
(
model
.
config
.
label2id
.
keys
())
==
sorted
(
label_list
):
# Reorganize `label_list` to match the ordering of the model.
if
labels_are_int
:
label_to_id
=
{
i
:
int
(
model
.
config
.
label2id
[
l
])
for
i
,
l
in
enumerate
(
label_list
)}
...
...
@@ -436,8 +436,8 @@ def main():
else
:
logger
.
warning
(
"Your model seems to have been trained with labels, but they don't match the dataset: "
,
f
"model labels:
{
list
(
sorted
(
model
.
config
.
label2id
.
keys
())
)
}
, dataset labels:"
f
"
{
list
(
sorted
(
label_list
)
)
}
.
\n
Ignoring the model labels as a result."
,
f
"model labels:
{
sorted
(
model
.
config
.
label2id
.
keys
())
}
, dataset labels:"
f
"
{
sorted
(
label_list
)
}
.
\n
Ignoring the model labels as a result."
,
)
# Set the correspondences label/ID inside the model config
...
...
examples/research_projects/bert-loses-patience/run_glue_with_pabee.py
View file @
5e8c8eb5
...
...
@@ -727,9 +727,9 @@ def main():
tokenizer
=
tokenizer_class
.
from_pretrained
(
args
.
output_dir
,
do_lower_case
=
args
.
do_lower_case
)
checkpoints
=
[
args
.
output_dir
]
if
args
.
eval_all_checkpoints
:
checkpoints
=
list
(
checkpoints
=
[
os
.
path
.
dirname
(
c
)
for
c
in
sorted
(
glob
.
glob
(
args
.
output_dir
+
"/**/"
+
WEIGHTS_NAME
,
recursive
=
True
))
)
]
logger
.
info
(
"Evaluate the following checkpoints: %s"
,
checkpoints
)
...
...
@@ -743,7 +743,7 @@ def main():
print
(
f
"Evaluation for checkpoint
{
prefix
}
"
)
for
patience
in
patience_list
:
result
=
evaluate
(
args
,
model
,
tokenizer
,
prefix
=
prefix
,
patience
=
patience
)
result
=
dict
((
k
+
"_{}"
.
format
(
global_step
)
,
v
)
for
k
,
v
in
result
.
items
()
)
result
=
{
k
+
"_{}"
.
format
(
global_step
)
:
v
for
k
,
v
in
result
.
items
()
}
results
.
update
(
result
)
return
results
...
...
examples/research_projects/bertabs/modeling_bertabs.py
View file @
5e8c8eb5
...
...
@@ -54,7 +54,7 @@ class BertAbs(BertAbsPreTrainedModel):
load_bert_pretrained_extractive
=
True
if
bert_extractive_checkpoint
else
False
if
load_bert_pretrained_extractive
:
self
.
bert
.
model
.
load_state_dict
(
dict
([(
n
[
11
:]
,
p
)
for
n
,
p
in
bert_extractive_checkpoint
.
items
()
if
n
.
startswith
(
"bert.model"
)
])
,
{
n
[
11
:]
:
p
for
n
,
p
in
bert_extractive_checkpoint
.
items
()
if
n
.
startswith
(
"bert.model"
)
}
,
strict
=
True
,
)
...
...
examples/research_projects/bertology/run_bertology.py
View file @
5e8c8eb5
...
...
@@ -218,9 +218,9 @@ def prune_heads(args, model, eval_dataloader, head_mask):
original_time
=
datetime
.
now
()
-
before_time
original_num_params
=
sum
(
p
.
numel
()
for
p
in
model
.
parameters
())
heads_to_prune
=
dict
(
(
layer
,
(
1
-
head_mask
[
layer
].
long
()).
nonzero
().
squeeze
().
tolist
()
)
for
layer
in
range
(
len
(
head_mask
))
)
heads_to_prune
=
{
layer
:
(
1
-
head_mask
[
layer
].
long
()).
nonzero
().
squeeze
().
tolist
()
for
layer
in
range
(
len
(
head_mask
))
}
assert
sum
(
len
(
h
)
for
h
in
heads_to_prune
.
values
())
==
(
1
-
head_mask
.
long
()).
sum
().
item
()
model
.
prune_heads
(
heads_to_prune
)
...
...
examples/research_projects/bertology/run_prune_gpt.py
View file @
5e8c8eb5
...
...
@@ -194,9 +194,9 @@ def prune_heads(args, model, eval_dataloader, head_mask):
original_time
=
datetime
.
now
()
-
before_time
original_num_params
=
sum
(
p
.
numel
()
for
p
in
model
.
parameters
())
heads_to_prune
=
dict
(
(
layer
,
(
1
-
head_mask
[
layer
].
long
()).
nonzero
().
squeeze
().
tolist
()
)
for
layer
in
range
(
len
(
head_mask
))
)
heads_to_prune
=
{
layer
:
(
1
-
head_mask
[
layer
].
long
()).
nonzero
().
squeeze
().
tolist
()
for
layer
in
range
(
len
(
head_mask
))
}
for
k
,
v
in
heads_to_prune
.
items
():
if
isinstance
(
v
,
int
):
...
...
examples/research_projects/codeparrot/scripts/minhash_deduplication.py
View file @
5e8c8eb5
...
...
@@ -29,7 +29,7 @@ def get_min_hash(tokens: List[str]) -> Optional[MinHash]:
def
get_tokens
(
code
:
str
)
->
Set
[
str
]:
"""Tokenize a code snippet."""
return
set
([
t
for
t
in
NON_ALPHA
.
split
(
code
)
if
len
(
t
.
strip
())
>
0
])
return
{
t
for
t
in
NON_ALPHA
.
split
(
code
)
if
len
(
t
.
strip
())
>
0
}
class
DuplicationIndex
:
...
...
@@ -243,7 +243,7 @@ def deduplicate_dataset(
>>> ds_dedup, duplicate_clusters = deduplicate_dataset(ds, jaccard_threshold=0.85)
"""
duplicate_clusters
=
make_duplicate_clusters
(
dataset
,
jaccard_threshold
)
duplicate_indices
=
set
(
x
[
"base_index"
]
for
cluster
in
duplicate_clusters
for
x
in
cluster
)
duplicate_indices
=
{
x
[
"base_index"
]
for
cluster
in
duplicate_clusters
for
x
in
cluster
}
extreme_dict
=
{}
extremes_clusters
=
find_extremes
(
duplicate_clusters
,
dataset
,
jaccard_threshold
)
for
extremes
in
extremes_clusters
:
...
...
examples/research_projects/codeparrot/scripts/preprocessing.py
View file @
5e8c8eb5
...
...
@@ -114,7 +114,7 @@ def char_token_ratio(example):
def
preprocess
(
example
):
"""Chain all preprocessing steps into one function to not fill cache."""
results
=
dict
()
results
=
{}
results
.
update
(
get_hash
(
example
))
results
.
update
(
line_stats
(
example
))
results
.
update
(
alpha_stats
(
example
))
...
...
examples/research_projects/codeparrot/scripts/pretokenizing.py
View file @
5e8c8eb5
...
...
@@ -8,7 +8,7 @@ from transformers import AutoTokenizer, HfArgumentParser
def
tokenize
(
example
):
output
=
dict
()
output
=
{}
output
[
"input_ids"
]
=
tokenizer
(
example
[
"content"
],
truncation
=
False
)[
"input_ids"
]
output
[
"ratio_char_token"
]
=
len
(
example
[
"content"
])
/
len
(
output
[
"input_ids"
])
return
output
...
...
Prev
1
2
3
4
5
6
…
12
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment