"docs/source/vscode:/vscode.git/clone" did not exist on "eb30a49b2028f2411514c10e432792ad581fc08b"
Unverified Commit 69e16abf authored by Nicholas Broad's avatar Nicholas Broad Committed by GitHub
Browse files

Switch from using sum for flattening lists of lists in group_texts (#14472)



* remove sum for list flattening

* change to chain(*)

* make chain object a list

* delete empty lines

per sgugger's suggestions
Co-authored-by: default avatarSylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: default avatarNicholas Broad <nicholas@nmbroad.com>
Co-authored-by: default avatarSylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 0b7d053c
......@@ -27,6 +27,7 @@ import os
import sys
import time
from dataclasses import dataclass, field
from itertools import chain
from pathlib import Path
from typing import Callable, Optional
......@@ -430,7 +431,7 @@ def main():
# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.
......
......@@ -25,6 +25,7 @@ import os
import sys
import time
from dataclasses import dataclass, field
from itertools import chain
# You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments.
from pathlib import Path
......@@ -453,7 +454,7 @@ if __name__ == "__main__":
# max_seq_length.
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.
......
......@@ -25,6 +25,7 @@ import os
import sys
import time
from dataclasses import dataclass, field
from itertools import chain
from pathlib import Path
from typing import Dict, List, Optional
......@@ -563,7 +564,7 @@ if __name__ == "__main__":
# Main data processing function that will concatenate all texts from our dataset and generate chunks of expanded_inputs_length.
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.
......
......@@ -26,6 +26,7 @@ import math
import os
import sys
from dataclasses import dataclass, field
from itertools import chain
from typing import Optional
import datasets
......@@ -408,7 +409,7 @@ def main():
# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.
......
......@@ -27,6 +27,7 @@ import logging
import math
import os
import random
from itertools import chain
from pathlib import Path
import datasets
......@@ -366,7 +367,7 @@ def main():
# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.
......
......@@ -26,6 +26,7 @@ import math
import os
import sys
from dataclasses import dataclass, field
from itertools import chain
from typing import Optional
import datasets
......@@ -432,7 +433,7 @@ def main():
# max_seq_length.
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.
......
......@@ -27,6 +27,7 @@ import logging
import math
import os
import random
from itertools import chain
from pathlib import Path
import datasets
......@@ -406,7 +407,7 @@ def main():
# max_seq_length.
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.
......
......@@ -23,6 +23,7 @@ import math
import os
import sys
from dataclasses import dataclass, field
from itertools import chain
from typing import Optional
import datasets
......@@ -403,7 +404,7 @@ def main():
# max_seq_length.
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.
......
......@@ -22,6 +22,7 @@ import logging
import os
import sys
from dataclasses import dataclass, field
from itertools import chain
from typing import Optional, Union
import datasets
......@@ -185,7 +186,7 @@ class DataCollatorForMultipleChoice:
flattened_features = [
[{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
]
flattened_features = sum(flattened_features, [])
flattened_features = list(chain(*flattened_features))
batch = self.tokenizer.pad(
flattened_features,
......@@ -333,8 +334,8 @@ def main():
]
# Flatten out
first_sentences = sum(first_sentences, [])
second_sentences = sum(second_sentences, [])
first_sentences = list(chain(*first_sentences))
second_sentences = list(chain(*second_sentences))
# Tokenize
tokenized_examples = tokenizer(
......
......@@ -24,6 +24,7 @@ import math
import os
import random
from dataclasses import dataclass
from itertools import chain
from pathlib import Path
from typing import Optional, Union
......@@ -224,7 +225,7 @@ class DataCollatorForMultipleChoice:
flattened_features = [
[{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
]
flattened_features = sum(flattened_features, [])
flattened_features = list(chain(*flattened_features))
batch = self.tokenizer.pad(
flattened_features,
......@@ -365,8 +366,8 @@ def main():
labels = examples[label_column_name]
# Flatten out
first_sentences = sum(first_sentences, [])
second_sentences = sum(second_sentences, [])
first_sentences = list(chain(*first_sentences))
second_sentences = list(chain(*second_sentences))
# Tokenize
tokenized_examples = tokenizer(
......
......@@ -23,6 +23,7 @@ import os
import sys
import time
from dataclasses import dataclass, field
from itertools import chain
from pathlib import Path
from typing import Callable, Optional
......@@ -364,7 +365,7 @@ def main():
# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.
......
......@@ -30,6 +30,7 @@ import random
import sys
from dataclasses import dataclass, field
from functools import partial
from itertools import chain
from pathlib import Path
from typing import Optional
......@@ -406,7 +407,7 @@ def main():
# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.
......
......@@ -32,6 +32,7 @@ import random
import sys
from dataclasses import dataclass, field
from functools import partial
from itertools import chain
from pathlib import Path
from typing import Optional
......@@ -462,7 +463,7 @@ def main():
# max_seq_length.
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.
......
......@@ -22,6 +22,7 @@ import logging
import os
import sys
from dataclasses import dataclass, field
from itertools import chain
from pathlib import Path
from typing import Optional
......@@ -342,8 +343,8 @@ def main():
]
# Flatten out
first_sentences = sum(first_sentences, [])
second_sentences = sum(second_sentences, [])
first_sentences = list(chain(*first_sentences))
second_sentences = list(chain(*second_sentences))
# Tokenize
tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True, max_length=max_seq_length)
......
......@@ -35,6 +35,7 @@ from dataclasses import fields
from enum import Enum
from functools import partial, wraps
from hashlib import sha256
from itertools import chain
from pathlib import Path
from types import ModuleType
from typing import Any, BinaryIO, ContextManager, Dict, List, Optional, Tuple, Union
......@@ -2129,7 +2130,7 @@ class _LazyModule(ModuleType):
for value in values:
self._class_to_module[value] = key
# Needed for autocompletion in an IDE
self.__all__ = list(import_structure.keys()) + sum(import_structure.values(), [])
self.__all__ = list(import_structure.keys()) + list(chain(*import_structure.values()))
self.__file__ = module_file
self.__spec__ = module_spec
self.__path__ = [os.path.dirname(module_file)]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment