"tests/ctrl/test_tokenization_ctrl.py" did not exist on "03c2c762a6cdf0c2b4a424385ba298a95ff12177"
Unverified Commit a14b055b authored by Albert Villanova del Moral, committed by GitHub

Pass datasets trust_remote_code (#31406)

* Pass datasets trust_remote_code

* Pass trust_remote_code in more tests

* Add trust_remote_dataset_code arg to some tests

* Revert "Temporarily pin datasets upper version to fix CI"

This reverts commit b7672826.

* Pass trust_remote_code in librispeech_asr_dummy docstrings

* Revert "Pin datasets<2.20.0 for examples"

This reverts commit 833fc17a.

* Pass trust_remote_code to all examples

* Revert "Add trust_remote_dataset_code arg to some tests" to research_projects

* Pass trust_remote_code to tests

* Pass trust_remote_code to docstrings

* Fix flax examples tests requirements

* Pass trust_remote_dataset_code arg to tests

* Replace trust_remote_dataset_code with trust_remote_code in one example

* Fix duplicate trust_remote_code

* Replace args.trust_remote_dataset_code with args.trust_remote_code

* Replace trust_remote_dataset_code with trust_remote_code in parser

* Replace trust_remote_dataset_code with trust_remote_code in dataclasses

* Replace trust_remote_dataset_code with trust_remote_code arg
parent 485fd814
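
Every hunk in this diff applies one pattern: pass an explicit `trust_remote_code` flag through to `datasets.load_dataset`, either wired from a `--trust_remote_code` argument (examples, tests) or hard-coded as `trust_remote_code=True` (conversion scripts, docstrings). A minimal sketch of the call pattern, using one of the script-based dataset repos touched below:

from datasets import load_dataset

# Opt in explicitly: only pass trust_remote_code=True for dataset repos whose
# loading script you have read and trust, as the script runs on your machine.
ds = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy",
    "clean",
    split="validation",
    trust_remote_code=True,
)
print(ds[0]["audio"]["sampling_rate"])

This explicit flag is also what allows reverting the `datasets<2.20.0` pin below: starting with `datasets` 2.20, loading a script-based dataset without `trust_remote_code=True` is rejected rather than silently trusted. The hunks below apply the pattern file by file.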
@@ -112,9 +112,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
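
The hunk above only rewords the `help` metadata; for context, the full field in the examples' `ModelArguments` dataclass looks roughly like this (a sketch, line wrapping illustrative):

from dataclasses import dataclass, field

@dataclass
class ModelArguments:
    trust_remote_code: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to trust the execution of code from datasets/models defined on the Hub."
                " This option should only be set to `True` for repositories you trust and in which you have read the"
                " code, as it will execute code present on the Hub on your local machine."
            )
        },
    )

`HfArgumentParser` exposes a boolean field like this as a `--trust_remote_code` CLI flag, which is how the test invocations below can pass it on the command line.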
@@ -366,6 +366,7 @@ def main():
             data_args.dataset_config_name,
             cache_dir=model_args.cache_dir,
             token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         data_files = {}
...
@@ -316,6 +316,7 @@ class ExamplesTests(TestCasePlus):
         testargs = f"""
             run_image_classification.py
             --dataset_name hf-internal-testing/cats_vs_dogs_sample
+            --trust_remote_code
             --model_name_or_path microsoft/resnet-18
             --do_train
             --do_eval
...
@@ -88,9 +88,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -239,6 +239,7 @@ def main():
             data_args.dataset_name,
             data_args.dataset_config_name,
             token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         data_files = {}
...
@@ -106,9 +106,9 @@ class ModelArguments:
         default=False,
         metadata={
             "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
             )
         },
     )
@@ -333,6 +333,7 @@ def main():
             data_args.dataset_config_name,
             cache_dir=model_args.cache_dir,
             token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         data_files = {}
...
@@ -13,7 +13,7 @@ TOKENIZER_CLASSES = {
     name: (getattr(transformers, name), getattr(transformers, name + "Fast")) for name in SLOW_TO_FAST_CONVERTERS
 }
-dataset = datasets.load_dataset("xnli", split="test+validation")
+dataset = datasets.load_dataset("facebook/xnli", split="test+validation")  # no-script
 total = 0
 perfect = 0
...
@@ -102,7 +102,7 @@ _deps = [
     "codecarbon==1.2.0",
     "cookiecutter==1.7.3",
     "dataclasses",
-    "datasets!=2.5.0,<2.20.0",  # Temporary upper version
+    "datasets!=2.5.0",
     "decord==0.6.0",
     "deepspeed>=0.9.3",
     "diffusers",
...
@@ -51,7 +51,9 @@ class TextToSpeechTool(PipelineTool):
             if not is_datasets_available():
                 raise ImportError("Datasets needs to be installed if not passing speaker embeddings.")
-            embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+            embeddings_dataset = load_dataset(
+                "Matthijs/cmu-arctic-xvectors", split="validation", trust_remote_code=True
+            )
             speaker_embeddings = torch.tensor(embeddings_dataset[7305]["xvector"]).unsqueeze(0)
         return {"input_ids": inputs["input_ids"], "speaker_embeddings": speaker_embeddings}
...
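
The `TextToSpeechTool` change can be exercised standalone; a minimal sketch assuming `datasets` and `torch` are installed (index 7305 comes from the hunk above; the 512-dim shape is the usual SpeechT5 x-vector size, an assumption here):

import torch
from datasets import load_dataset

# This dataset repo ships a loading script, hence trust_remote_code=True.
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors", split="validation", trust_remote_code=True
)
speaker_embeddings = torch.tensor(embeddings_dataset[7305]["xvector"]).unsqueeze(0)
print(speaker_embeddings.shape)  # expected: torch.Size([1, 512])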
@@ -202,7 +202,9 @@ class PTtoTFCommand(BaseTransformersCLICommand):
         """
 
         def _get_audio_input():
-            ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+            ds = load_dataset(
+                "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
+            )
             speech_samples = ds.sort("id").select(range(2))[:2]["audio"]
             raw_samples = [x["array"] for x in speech_samples]
             return raw_samples
@@ -234,7 +236,7 @@ class PTtoTFCommand(BaseTransformersCLICommand):
                 }
             )
             if "pixel_values" in model_forward_signature:
-                sample_images = load_dataset("cifar10", "plain_text", split="test")[:2]["img"]
+                sample_images = load_dataset("uoft-cs/cifar10", "plain_text", split="test")[:2]["img"]  # no-script
                 processor_inputs.update({"images": sample_images})
             if "input_features" in model_forward_signature:
                 feature_extractor_signature = inspect.signature(processor.feature_extractor).parameters
...
@@ -9,7 +9,7 @@ deps = {
     "codecarbon": "codecarbon==1.2.0",
     "cookiecutter": "cookiecutter==1.7.3",
     "dataclasses": "dataclasses",
-    "datasets": "datasets!=2.5.0,<2.20.0",
+    "datasets": "datasets!=2.5.0",
     "decord": "decord==0.6.0",
     "deepspeed": "deepspeed>=0.9.3",
     "diffusers": "diffusers",
...
@@ -1760,7 +1760,7 @@ class SuppressTokensAtBeginLogitsProcessor(LogitsProcessor):
     >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
     >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
-    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
     >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
     >>> # Whisper has `begin_suppress_tokens` set by default (= `[220, 50256]`). 50256 is the EOS token, so this means
@@ -1812,7 +1812,7 @@ class SuppressTokensLogitsProcessor(LogitsProcessor):
     >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
     >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
-    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
     >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
     >>> # Whisper has a long list of suppressed tokens. For instance, in this case, the token 1 is suppressed by default.
@@ -1901,7 +1901,7 @@ class WhisperTimeStampLogitsProcessor(LogitsProcessor):
     >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
     >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
-    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
     >>> inputs = processor(ds[3]["audio"]["array"], return_tensors="pt")
     >>> input_features = inputs.input_features
...
@@ -205,7 +205,8 @@ def convert_audio_spectrogram_transformer_checkpoint(model_name, pytorch_dump_fo
     feature_extractor = ASTFeatureExtractor(mean=mean, std=std, max_length=max_length)
     if "speech-commands" in model_name:
-        dataset = load_dataset("speech_commands", "v0.02", split="validation")
+        # TODO: Convert dataset to Parquet
+        dataset = load_dataset("google/speech_commands", "v0.02", split="validation", trust_remote_code=True)
         waveform = dataset[0]["audio"]["array"]
     else:
         filepath = hf_hub_download(
...
@@ -266,7 +266,7 @@ def convert_beit_checkpoint(checkpoint_url, pytorch_dump_folder_path):
     # Check outputs on an image
     if is_semantic:
         image_processor = BeitImageProcessor(size=config.image_size, do_center_crop=False)
-        ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+        ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
         image = Image.open(ds[0]["file"])
     else:
         image_processor = BeitImageProcessor(
...
@@ -2409,7 +2409,7 @@ class BigBirdForMaskedLM(BigBirdPreTrainedModel):
     >>> tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base")
     >>> model = BigBirdForMaskedLM.from_pretrained("google/bigbird-roberta-base")
-    >>> squad_ds = load_dataset("squad_v2", split="train")  # doctest: +IGNORE_RESULT
+    >>> squad_ds = load_dataset("rajpurkar/squad_v2", split="train")  # doctest: +IGNORE_RESULT
     >>> # select random long article
     >>> LONG_ARTICLE_TARGET = squad_ds[81514]["context"]
@@ -2711,7 +2711,7 @@ class BigBirdForSequenceClassification(BigBirdPreTrainedModel):
     >>> tokenizer = AutoTokenizer.from_pretrained("l-yohai/bigbird-roberta-base-mnli")
     >>> model = BigBirdForSequenceClassification.from_pretrained("l-yohai/bigbird-roberta-base-mnli")
-    >>> squad_ds = load_dataset("squad_v2", split="train")  # doctest: +IGNORE_RESULT
+    >>> squad_ds = load_dataset("rajpurkar/squad_v2", split="train")  # doctest: +IGNORE_RESULT
     >>> LONG_ARTICLE = squad_ds[81514]["context"]
     >>> inputs = tokenizer(LONG_ARTICLE, return_tensors="pt")
@@ -3040,7 +3040,7 @@ class BigBirdForQuestionAnswering(BigBirdPreTrainedModel):
     >>> tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base")
     >>> model = BigBirdForQuestionAnswering.from_pretrained("google/bigbird-roberta-base")
-    >>> squad_ds = load_dataset("squad_v2", split="train")  # doctest: +IGNORE_RESULT
+    >>> squad_ds = load_dataset("rajpurkar/squad_v2", split="train")  # doctest: +IGNORE_RESULT
     >>> # select random article and question
     >>> LONG_ARTICLE = squad_ds[81514]["context"]
...
@@ -1681,7 +1681,7 @@ class ClvpModelForConditionalGeneration(ClvpPreTrainedModel):
     >>> # Define the Text and Load the Audio (We are taking an audio example from HuggingFace Hub using `datasets` library)
     >>> text = "This is an example text."
-    >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
     >>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
     >>> _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
@@ -1754,7 +1754,7 @@ class ClvpModelForConditionalGeneration(ClvpPreTrainedModel):
     >>> # Define the Text and Load the Audio (We are taking an audio example from HuggingFace Hub using `datasets` library)
     >>> text = "This is an example text."
-    >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
     >>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
     >>> _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
...
@@ -226,7 +226,7 @@ def convert_wav2vec2_checkpoint(
     processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-lv60")
-    ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
+    ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
     input_audio = [x["array"] for x in ds[:4]["audio"]]
     inputs = processor(input_audio, return_tensors="pt", padding=True)
...
@@ -831,7 +831,7 @@ class Speech2Text2ForCausalLM(Speech2Text2PreTrainedModel):
     >>> model.config.decoder_start_token_id = tokenizer.bos_token_id
     >>> # pre-process inputs and labels
-    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
     >>> inputs = feature_extractor(
     ...     ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt"
     ... )
...
@@ -148,7 +148,7 @@ def convert_donut_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_
     model.load_state_dict(new_state_dict)
     # verify results on scanned document
-    dataset = load_dataset("hf-internal-testing/example-documents")
+    dataset = load_dataset("hf-internal-testing/example-documents")  # no-script
     image = dataset["test"][0]["image"].convert("RGB")
     tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name, from_slow=True)
...
@@ -1431,7 +1431,7 @@ class HubertModel(HubertPreTrainedModel):
     ...     return batch
-    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
     >>> ds = ds.map(map_to_array)
     >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values  # Batch size 1
...
@@ -1471,7 +1471,7 @@ class TFHubertModel(TFHubertPreTrainedModel):
     ...     return batch
-    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
     >>> ds = ds.map(map_to_array)
     >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values  # Batch size 1
@@ -1583,7 +1583,7 @@ class TFHubertForCTC(TFHubertPreTrainedModel):
     ...     return batch
-    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
     >>> ds = ds.map(map_to_array)
     >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values  # Batch size 1
...
@@ -1294,7 +1294,7 @@ class LayoutLMForQuestionAnswering(LayoutLMPreTrainedModel):
     >>> tokenizer = AutoTokenizer.from_pretrained("impira/layoutlm-document-qa", add_prefix_space=True)
     >>> model = LayoutLMForQuestionAnswering.from_pretrained("impira/layoutlm-document-qa", revision="1e3ebac")
-    >>> dataset = load_dataset("nielsr/funsd", split="train")
+    >>> dataset = load_dataset("nielsr/funsd", split="train", trust_remote_code=True)
     >>> example = dataset[0]
     >>> question = "what's his name?"
     >>> words = example["words"]
...