Unverified commit 25b8b8a6 authored by AK391, committed by GitHub

Merge branch 'huggingface:master' into master

parents 23801367 b67f345d
@@ -37,6 +37,7 @@ jobs:
pip install --upgrade pip
pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
pip install https://github.com/kpu/kenlm/archive/master.zip
+python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'
- name: Are GPUs recognized by our DL frameworks
run: |
@@ -241,6 +242,7 @@ jobs:
pip install --upgrade pip
pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
pip install https://github.com/kpu/kenlm/archive/master.zip
+python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'
- name: Are GPUs recognized by our DL frameworks
run: |
...
# Image Captioning (vision-encoder-text-decoder model) training example
The following example shows how to fine-tune a vision-encoder-text-decoder model for image captioning
using the JAX/Flax backend, leveraging the 🤗 Transformers library's [FlaxVisionEncoderDecoderModel](https://huggingface.co/docs/transformers/model_doc/visionencoderdecoder#transformers.FlaxVisionEncoderDecoderModel).
JAX/Flax allows you to trace pure functions and compile them into efficient, fused accelerator code on both GPU and TPU.
Models written in JAX/Flax are **immutable** and updated in a purely functional
way, which enables simple and efficient model parallelism.
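To make that pure-functional update pattern concrete, here is a toy sketch (not taken from the example script; the loss, shapes and learning rate are purely illustrative) of what a jitted training step looks like in JAX:

```python
import jax
import jax.numpy as jnp

# Toy illustration of the functional update pattern used by Flax training loops:
# parameters go in, new parameters come out; nothing is mutated in place.
def loss_fn(params, x, y):
    pred = x @ params["w"] + params["b"]
    return jnp.mean((pred - y) ** 2)

@jax.jit  # traced once, then compiled into fused accelerator code
def train_step(params, x, y, lr=0.1):
    grads = jax.grad(loss_fn)(params, x, y)
    return jax.tree_util.tree_map(lambda p, g: p - lr * g, params, grads)

params = {"w": jnp.zeros((3, 1)), "b": jnp.zeros((1,))}
x, y = jnp.ones((4, 3)), jnp.ones((4, 1))
params = train_step(params, x, y)  # returns a new params pytree
```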
`run_image_captioning_flax.py` is a lightweight example of how to download and preprocess a dataset from the 🤗 Datasets
library, or use your own files (jsonlines or csv), and then fine-tune a vision-encoder-text-decoder model on it.
For custom datasets in `jsonlines` format please see https://huggingface.co/docs/datasets/loading_datasets.html#json-files; a sketch of this format follows.
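For reference, a minimal sketch of what such a `jsonlines` file could contain is shown below. The field names `image_path` and `caption` mirror the `--image_column` and `--caption_column` arguments used in the training command further down; the file name and image paths are purely illustrative:

```python
import json

# Each line of the jsonlines file holds one image/caption pair. The field names
# must match the --image_column and --caption_column arguments of the script.
records = [
    {"image_path": "data/train2017/000000000009.jpg", "caption": "A plate of food on a table."},
    {"image_path": "data/train2017/000000000025.jpg", "caption": "A giraffe standing in a field."},
]

with open("train.json", "w") as f:
    for record in records:
        f.write(json.dumps(record) + "\n")
```

You would then point the training script at files like this instead of the COCO dataset script used in the example below.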
### Download COCO dataset (2017)
This example uses the COCO (2017) dataset through a custom dataset script, which requires users to manually download the
COCO data before training.
```bash
mkdir data
cd data
wget http://images.cocodataset.org/zips/train2017.zip
wget http://images.cocodataset.org/zips/val2017.zip
wget http://images.cocodataset.org/zips/test2017.zip
wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip
wget http://images.cocodataset.org/annotations/image_info_test2017.zip
cd ..
```
### Create a model from a vision encoder model and a text decoder model
Next, we create a [FlaxVisionEncoderDecoderModel](https://huggingface.co/docs/transformers/model_doc/visionencoderdecoder#transformers.FlaxVisionEncoderDecoderModel) instance from a pre-trained vision encoder ([ViT](https://huggingface.co/docs/transformers/model_doc/vit#transformers.FlaxViTModel)) and a pre-trained text decoder ([GPT2](https://huggingface.co/docs/transformers/model_doc/gpt2#transformers.FlaxGPT2Model)):
```bash
python3 create_model_from_encoder_decoder_models.py \
--output_dir model \
--encoder_model_name_or_path google/vit-base-patch16-224-in21k \
--decoder_model_name_or_path gpt2
```
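The helper script (shown further down) essentially wraps a single call to `from_encoder_decoder_pretrained`, plus the decoder-start/pad/eos token bookkeeping that GPT2 needs for generation. A minimal sketch of the core step, with that token handling omitted, looks like this:

```python
from transformers import AutoFeatureExtractor, AutoTokenizer, FlaxVisionEncoderDecoderModel

# Combine a pre-trained ViT encoder with a pre-trained GPT2 decoder; the
# cross-attention weights that connect them are randomly initialized.
model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    "google/vit-base-patch16-224-in21k", "gpt2"
)

# Save the encoder's feature extractor and the decoder's tokenizer alongside the
# model so everything can be reloaded from a single directory.
feature_extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

model.save_pretrained("model")
feature_extractor.save_pretrained("model")
tokenizer.save_pretrained("model")
```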
### Train the model
Finally, we can run the example script to train the model:
```bash
python3 run_image_captioning_flax.py \
--output_dir ./image-captioning-training-results \
--model_name_or_path model \
--dataset_name ydshieh/coco_dataset_script \
--dataset_config_name=2017 \
--data_dir $PWD/data \
--image_column image_path \
--caption_column caption \
--do_train --do_eval --predict_with_generate \
--num_train_epochs 1 \
--eval_steps 500 \
--learning_rate 3e-5 --warmup_steps 0 \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 32 \
--overwrite_output_dir \
--max_target_length 32 \
--num_beams 8 \
--preprocessing_num_workers 16 \
--logging_steps 10 \
--block_size 16384 \
--push_to_hub
```
This should finish in about 1h30 on Cloud TPU, giving a validation loss of 2.0153 and a ROUGE-2 score of 14.64
after 1 epoch. Training statistics can be inspected on the model repo's [TensorBoard tab](https://huggingface.co/ydshieh/image-captioning-training-results/tensorboard).
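Once the fine-tuned checkpoint has been saved (or pushed to the Hub), captions can be generated along the following lines. This is a sketch, assuming the checkpoint directory contains the model, feature extractor and tokenizer written by the example script; the image URL is just a sample COCO picture:

```python
import requests
from PIL import Image

from transformers import AutoFeatureExtractor, AutoTokenizer, FlaxVisionEncoderDecoderModel

# Path (or Hub repo id) of the fine-tuned checkpoint; adjust to your own output_dir.
checkpoint = "./image-captioning-training-results"

model = FlaxVisionEncoderDecoderModel.from_pretrained(checkpoint)
feature_extractor = AutoFeatureExtractor.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Any RGB image works; here we fetch a COCO validation image over HTTP.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Preprocess the image, generate token ids with beam search, then decode them.
pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values
outputs = model.generate(pixel_values, max_length=32, num_beams=8)
caption = tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True)[0]
print(caption)
```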
#!/usr/bin/env python
# coding=utf-8
# Copyright 2022 The HuggingFace Team All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Create a VisionEncoderDecoderModel instance from pretrained encoder/decoder models.
The cross-attention will be randomly initialized.
"""

from dataclasses import dataclass, field
from typing import Optional

from transformers import (
    AutoConfig,
    AutoFeatureExtractor,
    AutoTokenizer,
    FlaxVisionEncoderDecoderModel,
    HfArgumentParser,
)


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
    """

    output_dir: str = field(
        metadata={"help": "The output directory where the model will be written."},
    )
    encoder_model_name_or_path: str = field(
        metadata={
            "help": "The encoder model checkpoint for weights initialization."
            "Don't set if you want to train an encoder model from scratch."
        },
    )
    decoder_model_name_or_path: str = field(
        metadata={
            "help": "The decoder model checkpoint for weights initialization."
            "Don't set if you want to train a decoder model from scratch."
        },
    )
    encoder_config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained encoder config name or path if not the same as encoder_model_name"}
    )
    decoder_config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained decoder config name or path if not the same as decoder_model_name"}
    )


def main():
    parser = HfArgumentParser((ModelArguments,))
    (model_args,) = parser.parse_args_into_dataclasses()

    # Load pretrained model and tokenizer

    # Use an explicitly specified encoder config
    if model_args.encoder_config_name:
        encoder_config = AutoConfig.from_pretrained(model_args.encoder_config_name)
    # Otherwise, use the pretrained encoder model's config
    else:
        encoder_config = AutoConfig.from_pretrained(model_args.encoder_model_name_or_path)

    # Use an explicitly specified decoder config
    if model_args.decoder_config_name:
        decoder_config = AutoConfig.from_pretrained(model_args.decoder_config_name)
    # Otherwise, use the pretrained decoder model's config
    else:
        decoder_config = AutoConfig.from_pretrained(model_args.decoder_model_name_or_path)

    # necessary for `from_encoder_decoder_pretrained` when `decoder_config` is passed
    decoder_config.is_decoder = True
    decoder_config.add_cross_attention = True

    model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
        encoder_pretrained_model_name_or_path=model_args.encoder_model_name_or_path,
        decoder_pretrained_model_name_or_path=model_args.decoder_model_name_or_path,
        encoder_config=encoder_config,
        decoder_config=decoder_config,
    )

    # GPT2 only has bos/eos tokens but not decoder_start/pad tokens
    decoder_start_token_id = decoder_config.decoder_start_token_id
    pad_token_id = decoder_config.pad_token_id
    if decoder_start_token_id is None:
        decoder_start_token_id = decoder_config.bos_token_id
    if pad_token_id is None:
        pad_token_id = decoder_config.eos_token_id

    # This is necessary to make Flax's generate() work
    model.config.eos_token_id = decoder_config.eos_token_id
    model.config.decoder_start_token_id = decoder_start_token_id
    model.config.pad_token_id = pad_token_id

    feature_extractor = AutoFeatureExtractor.from_pretrained(model_args.encoder_model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(model_args.decoder_model_name_or_path)
    tokenizer.pad_token = tokenizer.convert_ids_to_tokens(model.config.pad_token_id)

    model.save_pretrained(model_args.output_dir)
    feature_extractor.save_pretrained(model_args.output_dir)
    tokenizer.save_pretrained(model_args.output_dir)


if __name__ == "__main__":
    main()
@@ -359,7 +359,7 @@ def main():
raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
if data_args.max_eval_samples is not None:
-raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_train_samples))
+raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
def prepare_dataset(batch):
# process audio
...
@@ -16,6 +16,22 @@
####################################################################################################
+#
+# Note: If when running this conversion script you're getting an exception:
+#     ModuleNotFoundError: No module named 'megatron.model.enums'
+# you need to tell python where to find the clone of Megatron-LM, e.g.:
+#
+# cd /tmp
+# git clone https://github.com/NVIDIA/Megatron-LM
+# PYTHONPATH=/tmp/Megatron-LM python src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py ...
+#
+# if you already have it cloned elsewhere, simply adjust the path to the existing path
+#
+# If the training was done using a Megatron-LM fork, e.g.,
+# https://github.com/microsoft/Megatron-DeepSpeed/ then chances are that you need to have that one
+# in your path, i.e., /path/to/Megatron-DeepSpeed/
+#
import argparse
import json
import os
...
@@ -16,6 +16,22 @@
####################################################################################################
+#
+# Note: If when running this conversion script you're getting an exception:
+#     ModuleNotFoundError: No module named 'megatron.model.enums'
+# you need to tell python where to find the clone of Megatron-LM, e.g.:
+#
+# cd /tmp
+# git clone https://github.com/NVIDIA/Megatron-LM
+# PYTHONPATH=/tmp/Megatron-LM python src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py ...
+#
+# if you already have it cloned elsewhere, simply adjust the path to the existing path
+#
+# If the training was done using a Megatron-LM fork, e.g.,
+# https://github.com/microsoft/Megatron-DeepSpeed/ then chances are that you need to have that one
+# in your path, i.e., /path/to/Megatron-DeepSpeed/
+#
import argparse
import os
import re
...
@@ -380,7 +380,7 @@ class SpeechEncoderDecoderModel(PreTrainedModel):
)
if "config" not in kwargs_encoder:
-encoder_config = AutoConfig.from_pretrained(encoder_pretrained_model_name_or_path, **kwargs_encoder)
+encoder_config = AutoConfig.from_pretrained(encoder_pretrained_model_name_or_path)
if encoder_config.is_decoder is True or encoder_config.add_cross_attention is True:
logger.info(
f"Initializing {encoder_pretrained_model_name_or_path} as a encoder model "
@@ -391,7 +391,7 @@ class SpeechEncoderDecoderModel(PreTrainedModel):
kwargs_encoder["config"] = encoder_config
-encoder = AutoModel.from_pretrained(encoder_pretrained_model_name_or_path, *model_args)
+encoder = AutoModel.from_pretrained(encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder)
decoder = kwargs_decoder.pop("model", None)
if decoder is None:
@@ -402,7 +402,7 @@ class SpeechEncoderDecoderModel(PreTrainedModel):
)
if "config" not in kwargs_decoder:
-decoder_config = AutoConfig.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)
+decoder_config = AutoConfig.from_pretrained(decoder_pretrained_model_name_or_path)
if decoder_config.is_decoder is False or decoder_config.add_cross_attention is False:
logger.info(
f"Initializing {decoder_pretrained_model_name_or_path} as a decoder model. "
@@ -424,7 +424,7 @@ class SpeechEncoderDecoderModel(PreTrainedModel):
"`decoder_config` to `.from_encoder_decoder_pretrained(...)`"
)
-decoder = AutoModelForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path)
+decoder = AutoModelForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)
# instantiate config with corresponding kwargs
config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config, **kwargs)
...
@@ -1290,6 +1290,7 @@ class UniSpeechForPreTraining(UniSpeechPreTrainedModel):
>>> batch_size, raw_sequence_length = input_values.shape
>>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length)
>>> mask_time_indices = _compute_mask_indices((batch_size, sequence_length), mask_prob=0.2, mask_length=2)
+>>> mask_time_indices = torch.tensor(mask_time_indices, device=input_values.device, dtype=torch.long)
>>> with torch.no_grad():
...     outputs = model(input_values, mask_time_indices=mask_time_indices)
...
@@ -1322,6 +1322,7 @@ class UniSpeechSatForPreTraining(UniSpeechSatPreTrainedModel):
>>> batch_size, raw_sequence_length = input_values.shape
>>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length)
>>> mask_time_indices = _compute_mask_indices((batch_size, sequence_length), mask_prob=0.2, mask_length=2)
+>>> mask_time_indices = torch.tensor(mask_time_indices, device=input_values.device, dtype=torch.long)
>>> with torch.no_grad():
...     outputs = model(input_values, mask_time_indices=mask_time_indices)
...
@@ -1460,6 +1460,7 @@ class Wav2Vec2ForPreTraining(Wav2Vec2PreTrainedModel):
>>> batch_size, raw_sequence_length = input_values.shape
>>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length)
>>> mask_time_indices = _compute_mask_indices((batch_size, sequence_length), mask_prob=0.2, mask_length=2)
+>>> mask_time_indices = torch.tensor(mask_time_indices, device=input_values.device, dtype=torch.long)
>>> with torch.no_grad():
...     outputs = model(input_values, mask_time_indices=mask_time_indices)
...
@@ -742,6 +742,8 @@ class Pipeline(_ScikitCompat):
self.model.config.update(task_specific_params.get(task))
self.call_count = 0
+self._batch_size = kwargs.pop("batch_size", None)
+self._num_workers = kwargs.pop("num_workers", None)
self._preprocess_params, self._forward_params, self._postprocess_params = self._sanitize_parameters(**kwargs)
def save_pretrained(self, save_directory: str):
@@ -947,9 +949,21 @@ class Pipeline(_ScikitCompat):
final_iterator = PipelineIterator(model_iterator, self.postprocess, postprocess_params)
return final_iterator
-def __call__(self, inputs, *args, num_workers=0, batch_size=1, **kwargs):
+def __call__(self, inputs, *args, num_workers=None, batch_size=None, **kwargs):
if args:
logger.warning(f"Ignoring args : {args}")
+if num_workers is None:
+    if self._num_workers is None:
+        num_workers = 0
+    else:
+        num_workers = self._num_workers
+if batch_size is None:
+    if self._batch_size is None:
+        batch_size = 1
+    else:
+        batch_size = self._batch_size
preprocess_params, forward_params, postprocess_params = self._sanitize_parameters(**kwargs)
# Fuse __init__ params and __call__ params without modifying the __init__ ones.
...
from typing import List, Union
-from ..file_utils import add_end_docstrings, is_torch_available, is_vision_available, requires_backends
+from ..file_utils import (
+    add_end_docstrings,
+    is_tf_available,
+    is_torch_available,
+    is_vision_available,
+    requires_backends,
+)
from ..utils import logging
from .base import PIPELINE_INIT_ARGS, Pipeline
@@ -10,6 +16,11 @@ if is_vision_available():
from ..image_utils import load_image
+if is_tf_available():
+    import tensorflow as tf
+    from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING
if is_torch_available():
from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING
@@ -31,12 +42,12 @@ class ImageClassificationPipeline(Pipeline):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
-if self.framework == "tf":
-    raise ValueError(f"The {self.__class__} is only available in PyTorch.")
requires_backends(self, "vision")
-self.check_model_type(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING)
+self.check_model_type(
+    TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING
+    if self.framework == "tf"
+    else MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING
+)
def _sanitize_parameters(self, top_k=None):
postprocess_params = {}
@@ -77,7 +88,7 @@ class ImageClassificationPipeline(Pipeline):
def preprocess(self, image):
image = load_image(image)
-model_inputs = self.feature_extractor(images=image, return_tensors="pt")
+model_inputs = self.feature_extractor(images=image, return_tensors=self.framework)
return model_inputs
def _forward(self, model_inputs):
@@ -87,8 +98,16 @@ class ImageClassificationPipeline(Pipeline):
def postprocess(self, model_outputs, top_k=5):
if top_k > self.model.config.num_labels:
top_k = self.model.config.num_labels
+if self.framework == "pt":
probs = model_outputs.logits.softmax(-1)[0]
scores, ids = probs.topk(top_k)
+elif self.framework == "tf":
+    probs = tf.nn.softmax(model_outputs.logits, axis=-1)[0]
+    topk = tf.math.top_k(probs, k=top_k)
+    scores, ids = topk.values.numpy(), topk.indices.numpy()
+else:
+    raise ValueError(f"Unsupported framework: {self.framework}")
scores = scores.tolist()
ids = ids.tolist()
...
@@ -307,6 +307,7 @@ class BertGenerationEncoderIntegrationTest(unittest.TestCase):
def test_inference_no_head_absolute_embedding(self):
model = BertGenerationEncoder.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder")
input_ids = torch.tensor([[101, 7592, 1010, 2026, 3899, 2003, 10140, 102]])
+with torch.no_grad():
output = model(input_ids)[0]
expected_shape = torch.Size([1, 8, 1024])
self.assertEqual(output.shape, expected_shape)
@@ -322,6 +323,7 @@ class BertGenerationDecoderIntegrationTest(unittest.TestCase):
def test_inference_no_head_absolute_embedding(self):
model = BertGenerationDecoder.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder")
input_ids = torch.tensor([[101, 7592, 1010, 2026, 3899, 2003, 10140, 102]])
+with torch.no_grad():
output = model(input_ids)[0]
expected_shape = torch.Size([1, 8, 50358])
self.assertEqual(output.shape, expected_shape)
...
@@ -858,6 +858,7 @@ def prepare_img():
@require_vision
+@require_torch
class CLIPModelIntegrationTest(unittest.TestCase):
@slow
def test_inference(self):
...
@@ -478,11 +478,11 @@ class LayoutLMv2ModelTest(ModelTesterMixin, unittest.TestCase):
def prepare_layoutlmv2_batch_inputs():
# Here we prepare a batch of 2 sequences to test a LayoutLMv2 forward pass on:
# fmt: off
-input_ids = torch.tensor([[101,1019,1014,1016,1037,12849,4747,1004,14246,2278,5439,4524,5002,2930,2193,2930,4341,3208,1005,1055,2171,2848,11300,3531,102],[101,4070,4034,7020,1024,3058,1015,1013,2861,1013,6070,19274,2772,6205,27814,16147,16147,4343,2047,10283,10969,14389,1012,2338,102]],device=torch_device) # noqa: E231
+input_ids = torch.tensor([[101,1019,1014,1016,1037,12849,4747,1004,14246,2278,5439,4524,5002,2930,2193,2930,4341,3208,1005,1055,2171,2848,11300,3531,102],[101,4070,4034,7020,1024,3058,1015,1013,2861,1013,6070,19274,2772,6205,27814,16147,16147,4343,2047,10283,10969,14389,1012,2338,102]]) # noqa: E231
-bbox = torch.tensor([[[0,0,0,0],[423,237,440,251],[427,272,441,287],[419,115,437,129],[961,885,992,912],[256,38,330,58],[256,38,330,58],[336,42,353,57],[360,39,401,56],[360,39,401,56],[411,39,471,59],[479,41,528,59],[533,39,630,60],[67,113,134,131],[141,115,209,132],[68,149,133,166],[141,149,187,164],[195,148,287,165],[195,148,287,165],[195,148,287,165],[295,148,349,165],[441,149,492,166],[497,149,546,164],[64,201,125,218],[1000,1000,1000,1000]],[[0,0,0,0],[662,150,754,166],[665,199,742,211],[519,213,554,228],[519,213,554,228],[134,433,187,454],[130,467,204,480],[130,467,204,480],[130,467,204,480],[130,467,204,480],[130,467,204,480],[314,469,376,482],[504,684,582,706],[941,825,973,900],[941,825,973,900],[941,825,973,900],[941,825,973,900],[610,749,652,765],[130,659,168,672],[176,657,237,672],[238,657,312,672],[443,653,628,672],[443,653,628,672],[716,301,825,317],[1000,1000,1000,1000]]],device=torch_device) # noqa: E231
+bbox = torch.tensor([[[0,0,0,0],[423,237,440,251],[427,272,441,287],[419,115,437,129],[961,885,992,912],[256,38,330,58],[256,38,330,58],[336,42,353,57],[360,39,401,56],[360,39,401,56],[411,39,471,59],[479,41,528,59],[533,39,630,60],[67,113,134,131],[141,115,209,132],[68,149,133,166],[141,149,187,164],[195,148,287,165],[195,148,287,165],[195,148,287,165],[295,148,349,165],[441,149,492,166],[497,149,546,164],[64,201,125,218],[1000,1000,1000,1000]],[[0,0,0,0],[662,150,754,166],[665,199,742,211],[519,213,554,228],[519,213,554,228],[134,433,187,454],[130,467,204,480],[130,467,204,480],[130,467,204,480],[130,467,204,480],[130,467,204,480],[314,469,376,482],[504,684,582,706],[941,825,973,900],[941,825,973,900],[941,825,973,900],[941,825,973,900],[610,749,652,765],[130,659,168,672],[176,657,237,672],[238,657,312,672],[443,653,628,672],[443,653,628,672],[716,301,825,317],[1000,1000,1000,1000]]]) # noqa: E231
image = ImageList(torch.randn((2,3,224,224)), image_sizes=[(224,224), (224,224)]) # noqa: E231
-attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],],device=torch_device) # noqa: E231
+attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],]) # noqa: E231
-token_type_ids = torch.tensor([[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]],device=torch_device) # noqa: E231
+token_type_ids = torch.tensor([[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]]) # noqa: E231
# fmt: on
return input_ids, bbox, image, attention_mask, token_type_ids
@@ -505,11 +505,11 @@ class LayoutLMv2ModelIntegrationTest(unittest.TestCase):
# forward pass
outputs = model(
-input_ids=input_ids,
+input_ids=input_ids.to(torch_device),
-bbox=bbox,
+bbox=bbox.to(torch_device),
-image=image,
+image=image.to(torch_device),
-attention_mask=attention_mask,
+attention_mask=attention_mask.to(torch_device),
-token_type_ids=token_type_ids,
+token_type_ids=token_type_ids.to(torch_device),
)
# verify the sequence output
...
@@ -485,6 +485,7 @@ class RobertaModelIntegrationTest(TestCasePlus):
model = RobertaForMaskedLM.from_pretrained("roberta-base")
input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
+with torch.no_grad():
output = model(input_ids)[0]
expected_shape = torch.Size((1, 11, 50265))
self.assertEqual(output.shape, expected_shape)
@@ -504,6 +505,7 @@ class RobertaModelIntegrationTest(TestCasePlus):
model = RobertaModel.from_pretrained("roberta-base")
input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
+with torch.no_grad():
output = model(input_ids)[0]
# compare the actual values for a slice.
expected_slice = torch.tensor(
@@ -521,6 +523,7 @@ class RobertaModelIntegrationTest(TestCasePlus):
model = RobertaForSequenceClassification.from_pretrained("roberta-large-mnli")
input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
+with torch.no_grad():
output = model(input_ids)[0]
expected_shape = torch.Size((1, 3))
self.assertEqual(output.shape, expected_shape)
...
@@ -254,7 +254,7 @@ class TFCLIPVisionModelTest(TFModelTesterMixin, unittest.TestCase):
@slow
def test_model_from_pretrained(self):
for model_name in TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
-model = TFCLIPVisionModel.from_pretrained(model_name, from_pt=True)
+model = TFCLIPVisionModel.from_pretrained(model_name)
self.assertIsNotNone(model)
@@ -359,7 +359,7 @@ class TFCLIPTextModelTest(TFModelTesterMixin, unittest.TestCase):
@slow
def test_model_from_pretrained(self):
for model_name in TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
-model = TFCLIPTextModel.from_pretrained(model_name, from_pt=True)
+model = TFCLIPTextModel.from_pretrained(model_name)
self.assertIsNotNone(model)
@@ -618,7 +618,7 @@ class TFCLIPModelTest(TFModelTesterMixin, unittest.TestCase):
@slow
def test_model_from_pretrained(self):
for model_name in TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
-model = TFCLIPModel.from_pretrained(model_name, from_pt=True)
+model = TFCLIPModel.from_pretrained(model_name)
self.assertIsNotNone(model)
@@ -630,11 +630,12 @@ def prepare_img():
@require_vision
+@require_tf
class TFCLIPModelIntegrationTest(unittest.TestCase):
@slow
def test_inference(self):
model_name = "openai/clip-vit-base-patch32"
-model = TFCLIPModel.from_pretrained(model_name, from_pt=True)
+model = TFCLIPModel.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)
image = prepare_img()
...
@@ -299,6 +299,16 @@ class CommonPipelineTest(unittest.TestCase):
self.assertIsInstance(pipe, TextClassificationPipeline)
+@require_torch
+def test_pipeline_batch_size_global(self):
+    pipe = pipeline(model="hf-internal-testing/tiny-random-distilbert")
+    self.assertEqual(pipe._batch_size, None)
+    self.assertEqual(pipe._num_workers, None)
+    pipe = pipeline(model="hf-internal-testing/tiny-random-distilbert", batch_size=2, num_workers=1)
+    self.assertEqual(pipe._batch_size, 2)
+    self.assertEqual(pipe._num_workers, 1)
@require_torch
def test_pipeline_override(self):
class MyPipeline(TextClassificationPipeline):
...
@@ -14,7 +14,12 @@
import unittest
-from transformers import MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, PreTrainedTokenizer, is_vision_available
+from transformers import (
+    MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
+    TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
+    PreTrainedTokenizer,
+    is_vision_available,
+)
from transformers.pipelines import ImageClassificationPipeline, pipeline
from transformers.testing_utils import (
is_pipeline_test,
@@ -40,9 +45,9 @@ else:
@is_pipeline_test
@require_vision
-@require_torch
class ImageClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
model_mapping = MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING
+tf_model_mapping = TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING
def get_test_pipeline(self, model, tokenizer, feature_extractor):
image_classifier = ImageClassificationPipeline(model=model, feature_extractor=feature_extractor, top_k=2)
@@ -145,9 +150,42 @@ class ImageClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
)
@require_tf
-@unittest.skip("Image classification is not implemented for TF")
def test_small_model_tf(self):
-pass
+small_model = "lysandre/tiny-vit-random"
+image_classifier = pipeline("image-classification", model=small_model)
+outputs = image_classifier("http://images.cocodataset.org/val2017/000000039769.jpg")
+self.assertEqual(
+    nested_simplify(outputs, decimals=4),
+    [
+        {"score": 0.0015, "label": "chambered nautilus, pearly nautilus, nautilus"},
+        {"score": 0.0015, "label": "pajama, pyjama, pj's, jammies"},
+        {"score": 0.0014, "label": "trench coat"},
+        {"score": 0.0014, "label": "handkerchief, hankie, hanky, hankey"},
+        {"score": 0.0014, "label": "baboon"},
+    ],
+)
+outputs = image_classifier(
+    [
+        "http://images.cocodataset.org/val2017/000000039769.jpg",
+        "http://images.cocodataset.org/val2017/000000039769.jpg",
+    ],
+    top_k=2,
+)
+self.assertEqual(
+    nested_simplify(outputs, decimals=4),
+    [
+        [
+            {"score": 0.0015, "label": "chambered nautilus, pearly nautilus, nautilus"},
+            {"score": 0.0015, "label": "pajama, pyjama, pj's, jammies"},
+        ],
+        [
+            {"score": 0.0015, "label": "chambered nautilus, pearly nautilus, nautilus"},
+            {"score": 0.0015, "label": "pajama, pyjama, pj's, jammies"},
+        ],
+    ],
+)
def test_custom_tokenizer(self):
tokenizer = PreTrainedTokenizer()
...