Commit b3506629 authored by Julien Chaumond
Browse files

overflowing_tokens do not really make sense here, let's just return a number


Co-Authored-By: Lysandre Debut <lysandre.debut@reseau.eseo.fr>
parent f5bcde0b
...@@ -338,7 +338,7 @@ def convert_examples_to_features( ...@@ -338,7 +338,7 @@ def convert_examples_to_features(
max_length=max_length, max_length=max_length,
truncate_both_sequences=True truncate_both_sequences=True
) )
if 'overflowing_tokens' in inputs and len(inputs['overflowing_tokens']) > 0: if 'num_truncated_tokens' in inputs and inputs['num_truncated_tokens'] > 0:
logger.info('Attention! you are cropping tokens (swag task is ok). ' logger.info('Attention! you are cropping tokens (swag task is ok). '
'If you are training ARC and RACE and you are poping question + options,' 'If you are training ARC and RACE and you are poping question + options,'
'you need to try to use a bigger max seq length!') 'you need to try to use a bigger max seq length!')
......
...@@ -837,7 +837,8 @@ class PreTrainedTokenizer(object): ...@@ -837,7 +837,8 @@ class PreTrainedTokenizer(object):
copy.deepcopy(pair_ids), copy.deepcopy(pair_ids),
max_length=max_length - n_added_tokens max_length=max_length - n_added_tokens
) )
encoded_inputs["overflowing_tokens"] = ids[- (len_ids - len(tokens_a)):] + pair_ids[- (len_pair_ids - len(tokens_b)):] truncated_tokens = ids[- (len_ids - len(tokens_a)):] + pair_ids[- (len_pair_ids - len(tokens_b)):]
encoded_inputs["num_truncated_tokens"] = len(truncated_tokens)
ids = tokens_a ids = tokens_a
pair_ids = tokens_b pair_ids = tokens_b
elif pair and n_added_tokens + (len_pair_ids if truncate_first_sequence else len_ids) >= max_length: elif pair and n_added_tokens + (len_pair_ids if truncate_first_sequence else len_ids) >= max_length:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment