# This is a BPE, word-aware tokenizer, so there is a correct way
# to fuse tokens.
is_subword=len(word)!=len(word_ref)
else:
# This is a fallback heuristic. It will most likely fail on any kind of
# text + punctuation mixture that ends up being considered a "word".
# Unfortunately, non-word-aware models cannot do better than this.
ifself.aggregation_strategyin{
AggregationStrategy.FIRST,
AggregationStrategy.AVERAGE,
AggregationStrategy.MAX,
}:
warnings.warn(UserWarning,"Tokenizer does not support real words, using fallback heuristic")