Unverified commit ae454f41, authored by amyeroberts and committed by GitHub

Update old existing feature extractor references (#24552)

* Update old existing feature extractor references

* Typo

* Apply suggestions from code review

* Apply suggestions from code review

* Apply suggestions from code review

* Address comments from review - update 'feature extractor'
Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
parent 10c2ac7b
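
For context, the change applied throughout the diff below is purely a rename: the deprecated feature extractor classes and variables in the vision model tests are replaced with their image processor equivalents. A minimal before/after sketch of that pattern, reusing the google/vit-base-patch16-224 checkpoint and the COCO fixture image that appear in these tests (ViTForImageClassification is used here only for illustration):

from PIL import Image

from transformers import AutoImageProcessor, ViTForImageClassification

# Before this commit the tests did:
#     feature_extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
#     inputs = feature_extractor(images=image, return_tensors="pt")

# After this commit the equivalent goes through the image processor API:
image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224")

image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
inputs = image_processor(images=image, return_tensors="pt")
outputs = model(**inputs)  # logits over the 1000 ImageNet classes

The preprocessing output is unchanged; only the class and variable names differ, which is why the diff touches imports, cached properties, and local variables but no expected values.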
@@ -45,7 +45,7 @@ if is_tf_available():
 if is_vision_available():
     from PIL import Image
-    from transformers import AutoFeatureExtractor
+    from transformers import AutoImageProcessor
 class TFSwinModelTester:
@@ -382,9 +382,9 @@ class TFSwinModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase
 @require_tf
 class TFSwinModelIntegrationTest(unittest.TestCase):
     @cached_property
-    def default_feature_extractor(self):
+    def default_image_processor(self):
         return (
-            AutoFeatureExtractor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
+            AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
             if is_vision_available()
             else None
         )
@@ -392,10 +392,10 @@ class TFSwinModelIntegrationTest(unittest.TestCase):
     @slow
     def test_inference_image_classification_head(self):
         model = TFSwinForImageClassification.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
-        feature_extractor = self.default_feature_extractor
+        image_processor = self.default_image_processor
         image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-        inputs = feature_extractor(images=image, return_tensors="tf")
+        inputs = image_processor(images=image, return_tensors="tf")
         # forward pass
         outputs = model(inputs)
...
@@ -36,7 +36,7 @@ if is_torch_available():
 if is_vision_available():
     from PIL import Image
-    from transformers import AutoFeatureExtractor
+    from transformers import AutoImageProcessor
 class Swinv2ModelTester:
@@ -412,9 +412,9 @@ class Swinv2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
 @require_torch
 class Swinv2ModelIntegrationTest(unittest.TestCase):
     @cached_property
-    def default_feature_extractor(self):
+    def default_image_processor(self):
         return (
-            AutoFeatureExtractor.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
+            AutoImageProcessor.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
             if is_vision_available()
             else None
         )
@@ -424,10 +424,10 @@ class Swinv2ModelIntegrationTest(unittest.TestCase):
         model = Swinv2ForImageClassification.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256").to(
             torch_device
         )
-        feature_extractor = self.default_feature_extractor
+        image_processor = self.default_image_processor
         image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-        inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
+        inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
         # forward pass
         with torch.no_grad():
...
@@ -39,7 +39,7 @@ if is_timm_available():
 if is_vision_available():
     from PIL import Image
-    from transformers import AutoFeatureExtractor
+    from transformers import AutoImageProcessor
 class TableTransformerModelTester:
@@ -501,13 +501,13 @@ def prepare_img():
 @slow
 class TableTransformerModelIntegrationTests(unittest.TestCase):
     def test_table_detection(self):
-        feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/table-transformer-detection")
+        image_processor = AutoImageProcessor.from_pretrained("microsoft/table-transformer-detection")
         model = TableTransformerForObjectDetection.from_pretrained("microsoft/table-transformer-detection")
         model.to(torch_device)
         file_path = hf_hub_download(repo_id="nielsr/example-pdf", repo_type="dataset", filename="example_pdf.png")
         image = Image.open(file_path).convert("RGB")
-        inputs = feature_extractor(image, return_tensors="pt").to(torch_device)
+        inputs = image_processor(image, return_tensors="pt").to(torch_device)
         # forward pass
         with torch.no_grad():
...
@@ -45,7 +45,7 @@ if is_torch_available():
 if is_vision_available():
-    from transformers import VideoMAEFeatureExtractor
+    from transformers import VideoMAEImageProcessor
 class TimesformerModelTester:
@@ -339,10 +339,10 @@ def prepare_video():
 @require_vision
 class TimesformerModelIntegrationTest(unittest.TestCase):
     @cached_property
-    def default_feature_extractor(self):
+    def default_image_processor(self):
         # logits were tested with a different mean and std, so we use the same here
         return (
-            VideoMAEFeatureExtractor(image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5])
+            VideoMAEImageProcessor(image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5])
             if is_vision_available()
             else None
         )
@@ -353,9 +353,9 @@ class TimesformerModelIntegrationTest(unittest.TestCase):
             torch_device
         )
-        feature_extractor = self.default_feature_extractor
+        image_processor = self.default_image_processor
         video = prepare_video()
-        inputs = feature_extractor(video[:8], return_tensors="pt").to(torch_device)
+        inputs = image_processor(video[:8], return_tensors="pt").to(torch_device)
         # forward pass
         with torch.no_grad():
...
@@ -564,7 +564,7 @@ def prepare_audio(num_samples=1):
 @require_vision
 class TvltModelIntegrationTest(unittest.TestCase):
     @cached_property
-    def default_feature_extractor(self):
+    def default_processors(self):
         # logits were tested with a different mean and std, so we use the same here
         return (
             TvltImageProcessor() if is_vision_available() else None,
@@ -574,7 +574,7 @@ class TvltModelIntegrationTest(unittest.TestCase):
     def test_inference_for_base_model(self):
         model = TvltModel.from_pretrained("ZinengTang/tvlt-base").to(torch_device)
-        image_processor, audio_feature_extractor = self.default_feature_extractor
+        image_processor, audio_feature_extractor = self.default_processors
         video = prepare_video()
         audio = prepare_audio()
         video_inputs = image_processor(video, return_tensors="pt").to(torch_device)
@@ -596,7 +596,7 @@ class TvltModelIntegrationTest(unittest.TestCase):
     def test_inference_for_pretraining(self):
         model = TvltForPreTraining.from_pretrained("ZinengTang/tvlt-base").to(torch_device)
-        image_processor, audio_feature_extractor = self.default_feature_extractor
+        image_processor, audio_feature_extractor = self.default_processors
         video = prepare_video()
         video_mixed = prepare_video()
         audio = prepare_audio()
...
@@ -42,7 +42,7 @@ if is_torch_available():
 if is_vision_available():
     from PIL import Image
-    from transformers import AutoFeatureExtractor
+    from transformers import AutoImageProcessor
 class VanModelTester:
@@ -254,16 +254,16 @@ def prepare_img():
 @require_vision
 class VanModelIntegrationTest(unittest.TestCase):
     @cached_property
-    def default_feature_extractor(self):
-        return AutoFeatureExtractor.from_pretrained(VAN_PRETRAINED_MODEL_ARCHIVE_LIST[0])
+    def default_image_processor(self):
+        return AutoImageProcessor.from_pretrained(VAN_PRETRAINED_MODEL_ARCHIVE_LIST[0])
     @slow
     def test_inference_image_classification_head(self):
         model = VanForImageClassification.from_pretrained(VAN_PRETRAINED_MODEL_ARCHIVE_LIST[0]).to(torch_device)
-        feature_extractor = self.default_feature_extractor
+        image_processor = self.default_image_processor
         image = prepare_img()
-        inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
+        inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
         # forward pass
         with torch.no_grad():
...
@@ -46,7 +46,7 @@ if is_torch_available():
 if is_vision_available():
-    from transformers import VideoMAEFeatureExtractor
+    from transformers import VideoMAEImageProcessor
 class VideoMAEModelTester:
@@ -359,10 +359,10 @@ def prepare_video():
 @require_vision
 class VideoMAEModelIntegrationTest(unittest.TestCase):
     @cached_property
-    def default_feature_extractor(self):
+    def default_image_processor(self):
         # logits were tested with a different mean and std, so we use the same here
         return (
-            VideoMAEFeatureExtractor(image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5])
+            VideoMAEImageProcessor(image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5])
             if is_vision_available()
             else None
         )
@@ -373,9 +373,9 @@ class VideoMAEModelIntegrationTest(unittest.TestCase):
             torch_device
         )
-        feature_extractor = self.default_feature_extractor
+        image_processor = self.default_image_processor
         video = prepare_video()
-        inputs = feature_extractor(video, return_tensors="pt").to(torch_device)
+        inputs = image_processor(video, return_tensors="pt").to(torch_device)
         # forward pass
         with torch.no_grad():
@@ -393,9 +393,9 @@ class VideoMAEModelIntegrationTest(unittest.TestCase):
     def test_inference_for_pretraining(self):
         model = VideoMAEForPreTraining.from_pretrained("MCG-NJU/videomae-base-short").to(torch_device)
-        feature_extractor = self.default_feature_extractor
+        image_processor = self.default_image_processor
         video = prepare_video()
-        inputs = feature_extractor(video, return_tensors="pt").to(torch_device)
+        inputs = image_processor(video, return_tensors="pt").to(torch_device)
         # add boolean mask, indicating which patches to mask
         local_path = hf_hub_download(repo_id="hf-internal-testing/bool-masked-pos", filename="bool_masked_pos.pt")
...
@@ -48,7 +48,7 @@ if is_torch_available():
 if is_vision_available():
     from PIL import Image
-    from transformers import ViTFeatureExtractor
+    from transformers import ViTImageProcessor
 @require_flax
@@ -462,12 +462,12 @@ class FlaxViT2GPT2ModelIntegrationTest(unittest.TestCase):
     def test_inference_coco_en(self):
         loc = "ydshieh/vit-gpt2-coco-en"
-        feature_extractor = ViTFeatureExtractor.from_pretrained(loc)
+        image_processor = ViTImageProcessor.from_pretrained(loc)
         tokenizer = AutoTokenizer.from_pretrained(loc)
         model = FlaxVisionEncoderDecoderModel.from_pretrained(loc)
         img = prepare_img()
-        pixel_values = feature_extractor(images=img, return_tensors="np").pixel_values
+        pixel_values = image_processor(images=img, return_tensors="np").pixel_values
         decoder_input_ids = np.array([[model.config.decoder_start_token_id]])
         logits = model(pixel_values, decoder_input_ids)[0]
...
@@ -45,7 +45,7 @@ if is_tf_available():
     from transformers import (
         AutoConfig,
-        AutoFeatureExtractor,
+        AutoImageProcessor,
         AutoTokenizer,
         TFAutoModel,
         TFAutoModelForCausalLM,
@@ -64,7 +64,7 @@ if is_torch_available():
 if is_vision_available():
     from PIL import Image
-    from transformers import ViTFeatureExtractor
+    from transformers import ViTImageProcessor
 @require_tf
@@ -828,11 +828,11 @@ class TFVisionEncoderDecoderModelSaveLoadTests(unittest.TestCase):
         load_weight_prefix = TFVisionEncoderDecoderModel.load_weight_prefix
         config = self.get_encoder_decoder_config()
-        feature_extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
+        image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
         decoder_tokenizer = AutoTokenizer.from_pretrained("gpt2")
         img = prepare_img()
-        pixel_values = feature_extractor(images=img, return_tensors="tf").pixel_values
+        pixel_values = image_processor(images=img, return_tensors="tf").pixel_values
         decoder_input_ids = decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids
         with tempfile.TemporaryDirectory() as tmp_dirname:
@@ -893,13 +893,13 @@ class TFViT2GPT2ModelIntegrationTest(unittest.TestCase):
     def test_inference_coco_en(self):
         loc = "ydshieh/vit-gpt2-coco-en"
-        feature_extractor = ViTFeatureExtractor.from_pretrained(loc)
+        image_processor = ViTImageProcessor.from_pretrained(loc)
         tokenizer = AutoTokenizer.from_pretrained(loc)
         model = TFVisionEncoderDecoderModel.from_pretrained(loc)
         # We will verify our results on an image of cute cats
         img = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-        pixel_values = feature_extractor(images=img, return_tensors="tf").pixel_values
+        pixel_values = image_processor(images=img, return_tensors="tf").pixel_values
         decoder_input_ids = tf.constant([[model.config.decoder_start_token_id]])
...
@@ -62,7 +62,7 @@ if is_vision_available():
     import PIL
     from PIL import Image
-    from transformers import ViTFeatureExtractor
+    from transformers import ViTImageProcessor
 @require_torch
@@ -749,7 +749,7 @@ class ViT2GPT2ModelIntegrationTest(unittest.TestCase):
     def test_inference_coco_en(self):
         loc = "ydshieh/vit-gpt2-coco-en"
-        feature_extractor = ViTFeatureExtractor.from_pretrained(loc)
+        image_processor = ViTImageProcessor.from_pretrained(loc)
         tokenizer = AutoTokenizer.from_pretrained(loc)
         model = VisionEncoderDecoderModel.from_pretrained(loc)
         model.to(torch_device)
@@ -757,7 +757,7 @@ class ViT2GPT2ModelIntegrationTest(unittest.TestCase):
         # We will verify our results on an image of cute cats
         img = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
-        pixel_values = feature_extractor(images=img, return_tensors="pt").pixel_values.to(torch_device)
+        pixel_values = image_processor(images=img, return_tensors="pt").pixel_values.to(torch_device)
         decoder_input_ids = torch.tensor([[model.config.decoder_start_token_id]]).to(torch_device)
...
@@ -170,10 +170,10 @@ class VisionTextDualEncoderProcessorTest(unittest.TestCase):
         self.assertListEqual(decoded_tok, decoded_processor)
     def test_model_input_names(self):
-        feature_extractor = self.get_image_processor()
+        image_processor = self.get_image_processor()
         tokenizer = self.get_tokenizer()
-        processor = VisionTextDualEncoderProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+        processor = VisionTextDualEncoderProcessor(tokenizer=tokenizer, image_processor=image_processor)
         input_str = "lower newer"
         image_input = self.prepare_image_inputs()
...
@@ -38,7 +38,7 @@ if is_tf_available():
 if is_vision_available():
     from PIL import Image
-    from transformers import ViTFeatureExtractor
+    from transformers import ViTImageProcessor
 class TFViTModelTester:
@@ -228,16 +228,16 @@ def prepare_img():
 @require_vision
 class TFViTModelIntegrationTest(unittest.TestCase):
     @cached_property
-    def default_feature_extractor(self):
-        return ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224") if is_vision_available() else None
+    def default_image_processor(self):
+        return ViTImageProcessor.from_pretrained("google/vit-base-patch16-224") if is_vision_available() else None
     @slow
     def test_inference_image_classification_head(self):
         model = TFViTForImageClassification.from_pretrained("google/vit-base-patch16-224")
-        feature_extractor = self.default_feature_extractor
+        image_processor = self.default_image_processor
         image = prepare_img()
-        inputs = feature_extractor(images=image, return_tensors="tf")
+        inputs = image_processor(images=image, return_tensors="tf")
         # forward pass
         outputs = model(**inputs)
...
@@ -45,7 +45,7 @@ if is_torch_available():
 if is_vision_available():
     from PIL import Image
-    from transformers import ViTFeatureExtractor
+    from transformers import ViTImageProcessor
 class ViTModelTester:
@@ -264,16 +264,16 @@ def prepare_img():
 @require_vision
 class ViTModelIntegrationTest(unittest.TestCase):
     @cached_property
-    def default_feature_extractor(self):
-        return ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224") if is_vision_available() else None
+    def default_image_processor(self):
+        return ViTImageProcessor.from_pretrained("google/vit-base-patch16-224") if is_vision_available() else None
     @slow
     def test_inference_image_classification_head(self):
         model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224").to(torch_device)
-        feature_extractor = self.default_feature_extractor
+        image_processor = self.default_image_processor
         image = prepare_img()
-        inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
+        inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
         # forward pass
         with torch.no_grad():
@@ -295,9 +295,9 @@ class ViTModelIntegrationTest(unittest.TestCase):
         # to visualize self-attention on higher resolution images.
         model = ViTModel.from_pretrained("facebook/dino-vits8").to(torch_device)
-        feature_extractor = ViTFeatureExtractor.from_pretrained("facebook/dino-vits8", size=480)
+        image_processor = ViTImageProcessor.from_pretrained("facebook/dino-vits8", size=480)
         image = prepare_img()
-        inputs = feature_extractor(images=image, return_tensors="pt")
+        inputs = image_processor(images=image, return_tensors="pt")
         pixel_values = inputs.pixel_values.to(torch_device)
         # forward pass
@@ -322,10 +322,10 @@ class ViTModelIntegrationTest(unittest.TestCase):
         A small test to make sure that inference work in half precision without any problem.
         """
         model = ViTModel.from_pretrained("facebook/dino-vits8", torch_dtype=torch.float16, device_map="auto")
-        feature_extractor = self.default_feature_extractor
+        image_processor = self.default_image_processor
         image = prepare_img()
-        inputs = feature_extractor(images=image, return_tensors="pt")
+        inputs = image_processor(images=image, return_tensors="pt")
         pixel_values = inputs.pixel_values.to(torch_device)
         # forward pass to make sure inference works in fp16
...
@@ -243,7 +243,7 @@ def prepare_img():
 @require_vision
 class ViTModelIntegrationTest(unittest.TestCase):
     @cached_property
-    def default_feature_extractor(self):
+    def default_image_processor(self):
         return (
             ViTHybridImageProcessor.from_pretrained(VIT_HYBRID_PRETRAINED_MODEL_ARCHIVE_LIST[0])
             if is_vision_available()
@@ -256,9 +256,9 @@ class ViTModelIntegrationTest(unittest.TestCase):
             torch_device
         )
-        feature_extractor = self.default_feature_extractor
+        image_processor = self.default_image_processor
         image = prepare_img()
-        inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
+        inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
         # forward pass
         with torch.no_grad():
@@ -275,12 +275,12 @@ class ViTModelIntegrationTest(unittest.TestCase):
     @slow
     @require_accelerate
     def test_accelerate_inference(self):
-        feature_extractor = ViTHybridImageProcessor.from_pretrained("google/vit-hybrid-base-bit-384")
+        image_processor = ViTHybridImageProcessor.from_pretrained("google/vit-hybrid-base-bit-384")
         model = ViTHybridForImageClassification.from_pretrained("google/vit-hybrid-base-bit-384", device_map="auto")
         image = prepare_img()
-        inputs = feature_extractor(images=image, return_tensors="pt")
+        inputs = image_processor(images=image, return_tensors="pt")
         outputs = model(**inputs)
         logits = outputs.logits
         # model predicts one of the 1000 ImageNet classes
...
@@ -46,7 +46,7 @@ if is_tf_available():
 if is_vision_available():
     from PIL import Image
-    from transformers import ViTFeatureExtractor
+    from transformers import ViTImageProcessor
 class TFViTMAEModelTester:
@@ -424,8 +424,8 @@ def prepare_img():
 @require_vision
 class TFViTMAEModelIntegrationTest(unittest.TestCase):
     @cached_property
-    def default_feature_extractor(self):
-        return ViTFeatureExtractor.from_pretrained("facebook/vit-mae-base") if is_vision_available() else None
+    def default_image_processor(self):
+        return ViTImageProcessor.from_pretrained("facebook/vit-mae-base") if is_vision_available() else None
     @slow
     def test_inference_for_pretraining(self):
@@ -434,9 +434,9 @@ class TFViTMAEModelIntegrationTest(unittest.TestCase):
         model = TFViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base")
-        feature_extractor = self.default_feature_extractor
+        image_processor = self.default_image_processor
         image = prepare_img()
-        inputs = feature_extractor(images=image, return_tensors="tf")
+        inputs = image_processor(images=image, return_tensors="tf")
         # prepare a noise vector that will be also used for testing the TF model
         # (this way we can ensure that the PT and TF models operate on the same inputs)
...
@@ -42,7 +42,7 @@ if is_torch_available():
 if is_vision_available():
     from PIL import Image
-    from transformers import ViTFeatureExtractor
+    from transformers import ViTImageProcessor
 class ViTMAEModelTester:
@@ -296,8 +296,8 @@ def prepare_img():
 @require_vision
 class ViTMAEModelIntegrationTest(unittest.TestCase):
     @cached_property
-    def default_feature_extractor(self):
-        return ViTFeatureExtractor.from_pretrained("facebook/vit-mae-base") if is_vision_available() else None
+    def default_image_processor(self):
+        return ViTImageProcessor.from_pretrained("facebook/vit-mae-base") if is_vision_available() else None
     @slow
     def test_inference_for_pretraining(self):
@@ -306,9 +306,9 @@ class ViTMAEModelIntegrationTest(unittest.TestCase):
         model = ViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base").to(torch_device)
-        feature_extractor = self.default_feature_extractor
+        image_processor = self.default_image_processor
         image = prepare_img()
-        inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
+        inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
         # prepare a noise vector that will be also used for testing the TF model
         # (this way we can ensure that the PT and TF models operate on the same inputs)
...
@@ -38,7 +38,7 @@ if is_torch_available():
 if is_vision_available():
     from PIL import Image
-    from transformers import ViTFeatureExtractor
+    from transformers import ViTImageProcessor
 class ViTMSNModelTester:
@@ -220,17 +220,17 @@ def prepare_img():
 @require_vision
 class ViTMSNModelIntegrationTest(unittest.TestCase):
     @cached_property
-    def default_feature_extractor(self):
-        return ViTFeatureExtractor.from_pretrained("facebook/vit-msn-small") if is_vision_available() else None
+    def default_image_processor(self):
+        return ViTImageProcessor.from_pretrained("facebook/vit-msn-small") if is_vision_available() else None
     @slow
     def test_inference_image_classification_head(self):
         torch.manual_seed(2)
         model = ViTMSNForImageClassification.from_pretrained("facebook/vit-msn-small").to(torch_device)
-        feature_extractor = self.default_feature_extractor
+        image_processor = self.default_image_processor
         image = prepare_img()
-        inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
+        inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
         # forward pass
         with torch.no_grad():
...
@@ -38,7 +38,7 @@ if is_torch_available():
 if is_vision_available():
     from PIL import Image
-    from transformers import AutoFeatureExtractor
+    from transformers import AutoImageProcessor
 class YolosModelTester:
@@ -345,16 +345,16 @@ def prepare_img():
 @require_vision
 class YolosModelIntegrationTest(unittest.TestCase):
     @cached_property
-    def default_feature_extractor(self):
-        return AutoFeatureExtractor.from_pretrained("hustvl/yolos-small") if is_vision_available() else None
+    def default_image_processor(self):
+        return AutoImageProcessor.from_pretrained("hustvl/yolos-small") if is_vision_available() else None
     @slow
     def test_inference_object_detection_head(self):
         model = YolosForObjectDetection.from_pretrained("hustvl/yolos-small").to(torch_device)
-        feature_extractor = self.default_feature_extractor
+        image_processor = self.default_image_processor
         image = prepare_img()
-        inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
+        inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
         # forward pass
         with torch.no_grad():
@@ -375,7 +375,7 @@ class YolosModelIntegrationTest(unittest.TestCase):
         self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4))
         # verify postprocessing
-        results = feature_extractor.post_process_object_detection(
+        results = image_processor.post_process_object_detection(
             outputs, threshold=0.3, target_sizes=[image.size[::-1]]
         )[0]
         expected_scores = torch.tensor([0.9994, 0.9790, 0.9964, 0.9972, 0.9861]).to(torch_device)
...