Unverified commit 62d30e05 authored by Sylvain Gugger, committed by GitHub

Small fix to the run clm script (#8973)

parent 28fa014a
@@ -102,8 +102,8 @@ class DataTrainingArguments:
         default=None,
         metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
     )
-    block_size: int = field(
-        default=-1,
+    block_size: Optional[int] = field(
+        default=None,
         metadata={
             "help": "Optional input sequence length after tokenization."
             "The training dataset will be truncated in block of this size for training."
@@ -261,8 +261,14 @@ def main():
         load_from_cache_file=not data_args.overwrite_cache,
     )
 
-    if data_args.block_size <= 0:
+    if data_args.block_size is None:
         block_size = tokenizer.model_max_length
+        if block_size > 1024:
+            logger.warn(
+                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
+                "Picking 1024 instead. You can change that default value by passing --block_size xxx."
+            )
+            block_size = 1024
     else:
         if data_args.block_size > tokenizer.model_max_length:
             logger.warn(
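And a standalone sketch of the block-size resolution this second hunk implements, extracted into one function for clarity. resolve_block_size is a hypothetical helper, not part of the script, and the min(...) clamp in the else branch is an assumption, since the diff is cut off inside that branch.

import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)


def resolve_block_size(block_size_arg, model_max_length):
    # Mirrors the updated run_clm.py logic shown in the hunk above.
    if block_size_arg is None:
        # --block_size not passed: fall back to the tokenizer's limit, capped
        # at 1024 because some tokenizers report a huge sentinel like int(1e30).
        block_size = model_max_length
        if block_size > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --block_size xxx."
            )
            block_size = 1024
    else:
        if block_size_arg > model_max_length:
            logger.warning("The passed --block_size exceeds the model maximum; clamping.")
        # Assumption: the truncated else branch clamps to model_max_length.
        block_size = min(block_size_arg, model_max_length)
    return block_size


# A GPT-2 tokenizer reports model_max_length=1024, so its behavior is unchanged;
# a sentinel-sized limit now falls back to 1024 with a warning instead of
# producing absurdly large blocks.
assert resolve_block_size(None, 1024) == 1024
assert resolve_block_size(None, int(1e30)) == 1024
assert resolve_block_size(512, 1024) == 512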