-    # 3. Now we can instantiate the feature extractor, tokenizers and model
+    # 4. Now we can instantiate the feature extractor, tokenizers and model
     # Note for distributed training, the .from_pretrained methods guarantee that only
     # one local process can concurrently download model & vocab.
...
@@ -692,6 +699,7 @@ def main():
         cache_dir=model_args.cache_dir,
         token=data_args.token,
         trust_remote_code=data_args.trust_remote_code,
+        use_fast=model_args.use_fast_tokenizer,
     )
     # load description tokenizer
...
@@ -700,8 +708,15 @@ def main():
         cache_dir=model_args.cache_dir,
         token=data_args.token,
         trust_remote_code=data_args.trust_remote_code,
+        use_fast=model_args.use_fast_tokenizer,
     )
+    if model_args.use_fast_tokenizer:
+        logger.warning("Disabling fast tokenizer warning: https://github.com/huggingface/transformers/blob/main/src/transformers/tokenization_utils_base.py#L3231-L3235")