"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "d25e25ee2b63ebfcd099deb689a5a7272574a10f"
Unverified commit a1ad16a4, authored by Sylvain Gugger, committed by GitHub
Browse files

Restrain tokenizer.model_max_length default (#9681)

* Restrain tokenizer.model_max_length default

* Fix indent
parent 7e662e6a
...@@ -338,6 +338,12 @@ def main(): ...@@ -338,6 +338,12 @@ def main():
if data_args.max_seq_length is None: if data_args.max_seq_length is None:
max_seq_length = tokenizer.model_max_length max_seq_length = tokenizer.model_max_length
if max_seq_length > 1024:
logger.warn(
f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
"Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
)
max_seq_length = 1024
else: else:
if data_args.max_seq_length > tokenizer.model_max_length: if data_args.max_seq_length > tokenizer.model_max_length:
logger.warn( logger.warn(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment