"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "3fd71c4431f2b31eaad737d364e4a4d9bf35fd5b"
Unverified commit 69e16abf, authored by Nicholas Broad, committed by GitHub

Switch from using sum for flattening lists of lists in group_texts (#14472)



* remove sum for list flattening

* change to chain(*)

* make chain object a list

* delete empty lines

per sgugger's suggestions
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Nicholas Broad <nicholas@nmbroad.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 0b7d053c
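
Context for the change (not part of the commit itself): sum(list_of_lists, []) rebuilds the accumulator list on every addition, so flattening is quadratic in the total number of elements, while itertools.chain walks each sub-list once. A minimal sketch of the equivalence:

from itertools import chain

list_of_lists = [[1, 2], [3], [4, 5, 6]]

# Quadratic-time flattening: every `+` copies the accumulator built so far.
flat_sum = sum(list_of_lists, [])

# Linear-time flattening: chain yields the items of each sub-list in turn.
flat_chain = list(chain(*list_of_lists))

assert flat_sum == flat_chain == [1, 2, 3, 4, 5, 6]

For a very large number of sub-lists, chain.from_iterable(list_of_lists) would avoid unpacking them as call arguments; the diff below keeps the chain(*...) form.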
@@ -27,6 +27,7 @@ import os
 import sys
 import time
 from dataclasses import dataclass, field
+from itertools import chain
 from pathlib import Path
 from typing import Callable, Optional
@@ -430,7 +431,7 @@ def main():
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
...
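
As a toy illustration of the updated group_texts (block_size and the sample batch are made-up values, and the chunking lines paraphrase the part of the script truncated from the hunk above):

from itertools import chain

block_size = 4  # illustrative chunk length, not a value from the scripts

def group_texts(examples):
    # Concatenate all texts (the line changed in this commit).
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the small remainder so every chunk has exactly block_size tokens.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    return result

batch = {"input_ids": [[1, 2, 3], [4, 5, 6, 7], [8, 9]]}
print(group_texts(batch))
# {'input_ids': [[1, 2, 3, 4], [5, 6, 7, 8]]}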
@@ -25,6 +25,7 @@ import os
 import sys
 import time
 from dataclasses import dataclass, field
+from itertools import chain
 # You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments.
 from pathlib import Path
@@ -453,7 +454,7 @@ if __name__ == "__main__":
     # max_seq_length.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
...
@@ -25,6 +25,7 @@ import os
 import sys
 import time
 from dataclasses import dataclass, field
+from itertools import chain
 from pathlib import Path
 from typing import Dict, List, Optional
@@ -563,7 +564,7 @@ if __name__ == "__main__":
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of expanded_inputs_length.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
...
@@ -26,6 +26,7 @@ import math
 import os
 import sys
 from dataclasses import dataclass, field
+from itertools import chain
 from typing import Optional
 import datasets
@@ -408,7 +409,7 @@ def main():
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
...
@@ -27,6 +27,7 @@ import logging
 import math
 import os
 import random
+from itertools import chain
 from pathlib import Path
 import datasets
@@ -366,7 +367,7 @@ def main():
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
...
@@ -26,6 +26,7 @@ import math
 import os
 import sys
 from dataclasses import dataclass, field
+from itertools import chain
 from typing import Optional
 import datasets
@@ -432,7 +433,7 @@ def main():
     # max_seq_length.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
...
@@ -27,6 +27,7 @@ import logging
 import math
 import os
 import random
+from itertools import chain
 from pathlib import Path
 import datasets
@@ -406,7 +407,7 @@ def main():
     # max_seq_length.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
...
@@ -23,6 +23,7 @@ import math
 import os
 import sys
 from dataclasses import dataclass, field
+from itertools import chain
 from typing import Optional
 import datasets
@@ -403,7 +404,7 @@ def main():
     # max_seq_length.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
...
@@ -22,6 +22,7 @@ import logging
 import os
 import sys
 from dataclasses import dataclass, field
+from itertools import chain
 from typing import Optional, Union
 import datasets
@@ -185,7 +186,7 @@ class DataCollatorForMultipleChoice:
         flattened_features = [
             [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
         ]
-        flattened_features = sum(flattened_features, [])
+        flattened_features = list(chain(*flattened_features))
         batch = self.tokenizer.pad(
             flattened_features,
@@ -333,8 +334,8 @@ def main():
         ]
         # Flatten out
-        first_sentences = sum(first_sentences, [])
-        second_sentences = sum(second_sentences, [])
+        first_sentences = list(chain(*first_sentences))
+        second_sentences = list(chain(*second_sentences))
         # Tokenize
         tokenized_examples = tokenizer(
...
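
The multiple-choice scripts use the same idiom: each feature carries num_choices tokenized candidates, which are re-grouped per choice and then flattened before padding. A hypothetical minimal example of that reshaping (num_choices and the input_ids values are invented):

from itertools import chain

num_choices = 2  # illustrative; the real datasets may have more choices per example
features = [
    {"input_ids": [[101, 7592], [101, 2088]]},  # one example, one row per choice
    {"input_ids": [[101, 2307], [101, 2175]]},
]

# Re-group so each dict holds a single choice, then flatten across examples.
flattened_features = [
    [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
]
flattened_features = list(chain(*flattened_features))

print(len(flattened_features))  # 4: len(features) * num_choices rows, ready for tokenizer.pad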
@@ -24,6 +24,7 @@ import math
 import os
 import random
 from dataclasses import dataclass
+from itertools import chain
 from pathlib import Path
 from typing import Optional, Union
@@ -224,7 +225,7 @@ class DataCollatorForMultipleChoice:
         flattened_features = [
             [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
         ]
-        flattened_features = sum(flattened_features, [])
+        flattened_features = list(chain(*flattened_features))
         batch = self.tokenizer.pad(
             flattened_features,
@@ -365,8 +366,8 @@ def main():
         labels = examples[label_column_name]
         # Flatten out
-        first_sentences = sum(first_sentences, [])
-        second_sentences = sum(second_sentences, [])
+        first_sentences = list(chain(*first_sentences))
+        second_sentences = list(chain(*second_sentences))
         # Tokenize
         tokenized_examples = tokenizer(
...
@@ -23,6 +23,7 @@ import os
 import sys
 import time
 from dataclasses import dataclass, field
+from itertools import chain
 from pathlib import Path
 from typing import Callable, Optional
@@ -364,7 +365,7 @@ def main():
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
...
@@ -30,6 +30,7 @@ import random
 import sys
 from dataclasses import dataclass, field
 from functools import partial
+from itertools import chain
 from pathlib import Path
 from typing import Optional
@@ -406,7 +407,7 @@ def main():
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
...
@@ -32,6 +32,7 @@ import random
 import sys
 from dataclasses import dataclass, field
 from functools import partial
+from itertools import chain
 from pathlib import Path
 from typing import Optional
@@ -462,7 +463,7 @@ def main():
     # max_seq_length.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
...
@@ -22,6 +22,7 @@ import logging
 import os
 import sys
 from dataclasses import dataclass, field
+from itertools import chain
 from pathlib import Path
 from typing import Optional
@@ -342,8 +343,8 @@ def main():
         ]
         # Flatten out
-        first_sentences = sum(first_sentences, [])
-        second_sentences = sum(second_sentences, [])
+        first_sentences = list(chain(*first_sentences))
+        second_sentences = list(chain(*second_sentences))
         # Tokenize
         tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True, max_length=max_seq_length)
...
@@ -35,6 +35,7 @@ from dataclasses import fields
 from enum import Enum
 from functools import partial, wraps
 from hashlib import sha256
+from itertools import chain
 from pathlib import Path
 from types import ModuleType
 from typing import Any, BinaryIO, ContextManager, Dict, List, Optional, Tuple, Union
@@ -2129,7 +2130,7 @@ class _LazyModule(ModuleType):
             for value in values:
                 self._class_to_module[value] = key
         # Needed for autocompletion in an IDE
-        self.__all__ = list(import_structure.keys()) + sum(import_structure.values(), [])
+        self.__all__ = list(import_structure.keys()) + list(chain(*import_structure.values()))
         self.__file__ = module_file
         self.__spec__ = module_spec
         self.__path__ = [os.path.dirname(module_file)]
...
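
The last hunk applies the idiom to _LazyModule, where import_structure maps submodule names to the names they export; chaining its values builds __all__ without the repeated list copies. A sketch with an illustrative (not exhaustive) import_structure:

from itertools import chain

# Hypothetical slice of an import_structure mapping submodules to exported names.
import_structure = {
    "configuration_bert": ["BertConfig"],
    "modeling_bert": ["BertModel", "BertForMaskedLM"],
}

__all__ = list(import_structure.keys()) + list(chain(*import_structure.values()))
print(__all__)
# ['configuration_bert', 'modeling_bert', 'BertConfig', 'BertModel', 'BertForMaskedLM']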