Unverified Commit ae454f41 authored by amyeroberts, committed by GitHub

Update old existing feature extractor references (#24552)

* Update old existing feature extractor references

* Typo

* Apply suggestions from code review

* Apply suggestions from code review

* Apply suggestions from code review

* Address comments from review - update 'feature extractor'
Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
parent 10c2ac7b
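For context on the diff that follows: the old `*FeatureExtractor` classes for vision models were deprecated in favour of equivalent `*ImageProcessor` classes that expose the same `from_pretrained` and preprocessing call, so the tests only need to switch names. A minimal sketch of the new-style call, using a checkpoint and fixture image that appear in the diff below (the fixture path assumes a local checkout of the transformers repository):

```python
from PIL import Image

from transformers import ViTImageProcessor  # replaces the deprecated ViTFeatureExtractor

# Same checkpoint and __call__ signature as before; only the class name changes.
image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
inputs = image_processor(images=image, return_tensors="pt")
print(inputs.pixel_values.shape)  # torch.Size([1, 3, 224, 224]) for this checkpoint
```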
@@ -45,7 +45,7 @@ if is_tf_available():
if is_vision_available():
from PIL import Image
from transformers import AutoFeatureExtractor
from transformers import AutoImageProcessor
class TFSwinModelTester:
@@ -382,9 +382,9 @@ class TFSwinModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase
@require_tf
class TFSwinModelIntegrationTest(unittest.TestCase):
@cached_property
def default_feature_extractor(self):
def default_image_processor(self):
return (
AutoFeatureExtractor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
if is_vision_available()
else None
)
@@ -392,10 +392,10 @@ class TFSwinModelIntegrationTest(unittest.TestCase):
@slow
def test_inference_image_classification_head(self):
model = TFSwinForImageClassification.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
feature_extractor = self.default_feature_extractor
image_processor = self.default_image_processor
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
inputs = feature_extractor(images=image, return_tensors="tf")
inputs = image_processor(images=image, return_tensors="tf")
# forward pass
outputs = model(inputs)
......
@@ -36,7 +36,7 @@ if is_torch_available():
if is_vision_available():
from PIL import Image
from transformers import AutoFeatureExtractor
from transformers import AutoImageProcessor
class Swinv2ModelTester:
@@ -412,9 +412,9 @@ class Swinv2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
@require_torch
class Swinv2ModelIntegrationTest(unittest.TestCase):
@cached_property
def default_feature_extractor(self):
def default_image_processor(self):
return (
AutoFeatureExtractor.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
AutoImageProcessor.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
if is_vision_available()
else None
)
@@ -424,10 +424,10 @@ class Swinv2ModelIntegrationTest(unittest.TestCase):
model = Swinv2ForImageClassification.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256").to(
torch_device
)
feature_extractor = self.default_feature_extractor
image_processor = self.default_image_processor
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
......
@@ -39,7 +39,7 @@ if is_timm_available():
if is_vision_available():
from PIL import Image
from transformers import AutoFeatureExtractor
from transformers import AutoImageProcessor
class TableTransformerModelTester:
@@ -501,13 +501,13 @@ def prepare_img():
@slow
class TableTransformerModelIntegrationTests(unittest.TestCase):
def test_table_detection(self):
feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/table-transformer-detection")
image_processor = AutoImageProcessor.from_pretrained("microsoft/table-transformer-detection")
model = TableTransformerForObjectDetection.from_pretrained("microsoft/table-transformer-detection")
model.to(torch_device)
file_path = hf_hub_download(repo_id="nielsr/example-pdf", repo_type="dataset", filename="example_pdf.png")
image = Image.open(file_path).convert("RGB")
inputs = feature_extractor(image, return_tensors="pt").to(torch_device)
inputs = image_processor(image, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
......
@@ -45,7 +45,7 @@ if is_torch_available():
if is_vision_available():
from transformers import VideoMAEFeatureExtractor
from transformers import VideoMAEImageProcessor
class TimesformerModelTester:
@@ -339,10 +339,10 @@ def prepare_video():
@require_vision
class TimesformerModelIntegrationTest(unittest.TestCase):
@cached_property
def default_feature_extractor(self):
def default_image_processor(self):
# logits were tested with a different mean and std, so we use the same here
return (
VideoMAEFeatureExtractor(image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5])
VideoMAEImageProcessor(image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5])
if is_vision_available()
else None
)
@@ -353,9 +353,9 @@ class TimesformerModelIntegrationTest(unittest.TestCase):
torch_device
)
feature_extractor = self.default_feature_extractor
image_processor = self.default_image_processor
video = prepare_video()
inputs = feature_extractor(video[:8], return_tensors="pt").to(torch_device)
inputs = image_processor(video[:8], return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
......
@@ -564,7 +564,7 @@ def prepare_audio(num_samples=1):
@require_vision
class TvltModelIntegrationTest(unittest.TestCase):
@cached_property
def default_feature_extractor(self):
def default_processors(self):
# logits were tested with a different mean and std, so we use the same here
return (
TvltImageProcessor() if is_vision_available() else None,
@@ -574,7 +574,7 @@ class TvltModelIntegrationTest(unittest.TestCase):
def test_inference_for_base_model(self):
model = TvltModel.from_pretrained("ZinengTang/tvlt-base").to(torch_device)
image_processor, audio_feature_extractor = self.default_feature_extractor
image_processor, audio_feature_extractor = self.default_processors
video = prepare_video()
audio = prepare_audio()
video_inputs = image_processor(video, return_tensors="pt").to(torch_device)
@@ -596,7 +596,7 @@ class TvltModelIntegrationTest(unittest.TestCase):
def test_inference_for_pretraining(self):
model = TvltForPreTraining.from_pretrained("ZinengTang/tvlt-base").to(torch_device)
image_processor, audio_feature_extractor = self.default_feature_extractor
image_processor, audio_feature_extractor = self.default_processors
video = prepare_video()
video_mixed = prepare_video()
audio = prepare_audio()
......
@@ -42,7 +42,7 @@ if is_torch_available():
if is_vision_available():
from PIL import Image
from transformers import AutoFeatureExtractor
from transformers import AutoImageProcessor
class VanModelTester:
@@ -254,16 +254,16 @@ def prepare_img():
@require_vision
class VanModelIntegrationTest(unittest.TestCase):
@cached_property
def default_feature_extractor(self):
return AutoFeatureExtractor.from_pretrained(VAN_PRETRAINED_MODEL_ARCHIVE_LIST[0])
def default_image_processor(self):
return AutoImageProcessor.from_pretrained(VAN_PRETRAINED_MODEL_ARCHIVE_LIST[0])
@slow
def test_inference_image_classification_head(self):
model = VanForImageClassification.from_pretrained(VAN_PRETRAINED_MODEL_ARCHIVE_LIST[0]).to(torch_device)
feature_extractor = self.default_feature_extractor
image_processor = self.default_image_processor
image = prepare_img()
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
......
@@ -46,7 +46,7 @@ if is_torch_available():
if is_vision_available():
from transformers import VideoMAEFeatureExtractor
from transformers import VideoMAEImageProcessor
class VideoMAEModelTester:
@@ -359,10 +359,10 @@ def prepare_video():
@require_vision
class VideoMAEModelIntegrationTest(unittest.TestCase):
@cached_property
def default_feature_extractor(self):
def default_image_processor(self):
# logits were tested with a different mean and std, so we use the same here
return (
VideoMAEFeatureExtractor(image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5])
VideoMAEImageProcessor(image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5])
if is_vision_available()
else None
)
@@ -373,9 +373,9 @@ class VideoMAEModelIntegrationTest(unittest.TestCase):
torch_device
)
feature_extractor = self.default_feature_extractor
image_processor = self.default_image_processor
video = prepare_video()
inputs = feature_extractor(video, return_tensors="pt").to(torch_device)
inputs = image_processor(video, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
@@ -393,9 +393,9 @@ class VideoMAEModelIntegrationTest(unittest.TestCase):
def test_inference_for_pretraining(self):
model = VideoMAEForPreTraining.from_pretrained("MCG-NJU/videomae-base-short").to(torch_device)
feature_extractor = self.default_feature_extractor
image_processor = self.default_image_processor
video = prepare_video()
inputs = feature_extractor(video, return_tensors="pt").to(torch_device)
inputs = image_processor(video, return_tensors="pt").to(torch_device)
# add boolean mask, indicating which patches to mask
local_path = hf_hub_download(repo_id="hf-internal-testing/bool-masked-pos", filename="bool_masked_pos.pt")
......
@@ -48,7 +48,7 @@ if is_torch_available():
if is_vision_available():
from PIL import Image
from transformers import ViTFeatureExtractor
from transformers import ViTImageProcessor
@require_flax
@@ -462,12 +462,12 @@ class FlaxViT2GPT2ModelIntegrationTest(unittest.TestCase):
def test_inference_coco_en(self):
loc = "ydshieh/vit-gpt2-coco-en"
feature_extractor = ViTFeatureExtractor.from_pretrained(loc)
image_processor = ViTImageProcessor.from_pretrained(loc)
tokenizer = AutoTokenizer.from_pretrained(loc)
model = FlaxVisionEncoderDecoderModel.from_pretrained(loc)
img = prepare_img()
pixel_values = feature_extractor(images=img, return_tensors="np").pixel_values
pixel_values = image_processor(images=img, return_tensors="np").pixel_values
decoder_input_ids = np.array([[model.config.decoder_start_token_id]])
logits = model(pixel_values, decoder_input_ids)[0]
......
@@ -45,7 +45,7 @@ if is_tf_available():
from transformers import (
AutoConfig,
AutoFeatureExtractor,
AutoImageProcessor,
AutoTokenizer,
TFAutoModel,
TFAutoModelForCausalLM,
@@ -64,7 +64,7 @@ if is_torch_available():
if is_vision_available():
from PIL import Image
from transformers import ViTFeatureExtractor
from transformers import ViTImageProcessor
@require_tf
@@ -828,11 +828,11 @@ class TFVisionEncoderDecoderModelSaveLoadTests(unittest.TestCase):
load_weight_prefix = TFVisionEncoderDecoderModel.load_weight_prefix
config = self.get_encoder_decoder_config()
feature_extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
decoder_tokenizer = AutoTokenizer.from_pretrained("gpt2")
img = prepare_img()
pixel_values = feature_extractor(images=img, return_tensors="tf").pixel_values
pixel_values = image_processor(images=img, return_tensors="tf").pixel_values
decoder_input_ids = decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids
with tempfile.TemporaryDirectory() as tmp_dirname:
@@ -893,13 +893,13 @@ class TFViT2GPT2ModelIntegrationTest(unittest.TestCase):
def test_inference_coco_en(self):
loc = "ydshieh/vit-gpt2-coco-en"
feature_extractor = ViTFeatureExtractor.from_pretrained(loc)
image_processor = ViTImageProcessor.from_pretrained(loc)
tokenizer = AutoTokenizer.from_pretrained(loc)
model = TFVisionEncoderDecoderModel.from_pretrained(loc)
# We will verify our results on an image of cute cats
img = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
pixel_values = feature_extractor(images=img, return_tensors="tf").pixel_values
pixel_values = image_processor(images=img, return_tensors="tf").pixel_values
decoder_input_ids = tf.constant([[model.config.decoder_start_token_id]])
......
@@ -62,7 +62,7 @@ if is_vision_available():
import PIL
from PIL import Image
from transformers import ViTFeatureExtractor
from transformers import ViTImageProcessor
@require_torch
@@ -749,7 +749,7 @@ class ViT2GPT2ModelIntegrationTest(unittest.TestCase):
def test_inference_coco_en(self):
loc = "ydshieh/vit-gpt2-coco-en"
feature_extractor = ViTFeatureExtractor.from_pretrained(loc)
image_processor = ViTImageProcessor.from_pretrained(loc)
tokenizer = AutoTokenizer.from_pretrained(loc)
model = VisionEncoderDecoderModel.from_pretrained(loc)
model.to(torch_device)
@@ -757,7 +757,7 @@ class ViT2GPT2ModelIntegrationTest(unittest.TestCase):
# We will verify our results on an image of cute cats
img = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
pixel_values = feature_extractor(images=img, return_tensors="pt").pixel_values.to(torch_device)
pixel_values = image_processor(images=img, return_tensors="pt").pixel_values.to(torch_device)
decoder_input_ids = torch.tensor([[model.config.decoder_start_token_id]]).to(torch_device)
......
@@ -170,10 +170,10 @@ class VisionTextDualEncoderProcessorTest(unittest.TestCase):
self.assertListEqual(decoded_tok, decoded_processor)
def test_model_input_names(self):
feature_extractor = self.get_image_processor()
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
processor = VisionTextDualEncoderProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
processor = VisionTextDualEncoderProcessor(tokenizer=tokenizer, image_processor=image_processor)
input_str = "lower newer"
image_input = self.prepare_image_inputs()
......
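The hunk above is one place where more than a local variable changes: `VisionTextDualEncoderProcessor` is now constructed with an `image_processor=` keyword instead of `feature_extractor=`. A rough sketch of the new constructor call; the CLIP and BERT checkpoints are illustrative placeholders, not taken from this diff:

```python
from transformers import AutoImageProcessor, AutoTokenizer, VisionTextDualEncoderProcessor

# Placeholder checkpoints: any vision backbone / text tokenizer pair works the same way.
image_processor = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# New keyword argument: image_processor= (the test above drops the old feature_extractor= spelling).
processor = VisionTextDualEncoderProcessor(image_processor=image_processor, tokenizer=tokenizer)

encoding = processor(text="lower newer", return_tensors="pt")  # pass images= as well to get pixel_values
```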
@@ -38,7 +38,7 @@ if is_tf_available():
if is_vision_available():
from PIL import Image
from transformers import ViTFeatureExtractor
from transformers import ViTImageProcessor
class TFViTModelTester:
@@ -228,16 +228,16 @@ def prepare_img():
@require_vision
class TFViTModelIntegrationTest(unittest.TestCase):
@cached_property
def default_feature_extractor(self):
return ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224") if is_vision_available() else None
def default_image_processor(self):
return ViTImageProcessor.from_pretrained("google/vit-base-patch16-224") if is_vision_available() else None
@slow
def test_inference_image_classification_head(self):
model = TFViTForImageClassification.from_pretrained("google/vit-base-patch16-224")
feature_extractor = self.default_feature_extractor
image_processor = self.default_image_processor
image = prepare_img()
inputs = feature_extractor(images=image, return_tensors="tf")
inputs = image_processor(images=image, return_tensors="tf")
# forward pass
outputs = model(**inputs)
......
@@ -45,7 +45,7 @@ if is_torch_available():
if is_vision_available():
from PIL import Image
from transformers import ViTFeatureExtractor
from transformers import ViTImageProcessor
class ViTModelTester:
@@ -264,16 +264,16 @@ def prepare_img():
@require_vision
class ViTModelIntegrationTest(unittest.TestCase):
@cached_property
def default_feature_extractor(self):
return ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224") if is_vision_available() else None
def default_image_processor(self):
return ViTImageProcessor.from_pretrained("google/vit-base-patch16-224") if is_vision_available() else None
@slow
def test_inference_image_classification_head(self):
model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224").to(torch_device)
feature_extractor = self.default_feature_extractor
image_processor = self.default_image_processor
image = prepare_img()
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
@@ -295,9 +295,9 @@ class ViTModelIntegrationTest(unittest.TestCase):
# to visualize self-attention on higher resolution images.
model = ViTModel.from_pretrained("facebook/dino-vits8").to(torch_device)
feature_extractor = ViTFeatureExtractor.from_pretrained("facebook/dino-vits8", size=480)
image_processor = ViTImageProcessor.from_pretrained("facebook/dino-vits8", size=480)
image = prepare_img()
inputs = feature_extractor(images=image, return_tensors="pt")
inputs = image_processor(images=image, return_tensors="pt")
pixel_values = inputs.pixel_values.to(torch_device)
# forward pass
@@ -322,10 +322,10 @@ class ViTModelIntegrationTest(unittest.TestCase):
A small test to make sure that inference work in half precision without any problem.
"""
model = ViTModel.from_pretrained("facebook/dino-vits8", torch_dtype=torch.float16, device_map="auto")
feature_extractor = self.default_feature_extractor
image_processor = self.default_image_processor
image = prepare_img()
inputs = feature_extractor(images=image, return_tensors="pt")
inputs = image_processor(images=image, return_tensors="pt")
pixel_values = inputs.pixel_values.to(torch_device)
# forward pass to make sure inference works in fp16
......
@@ -243,7 +243,7 @@ def prepare_img():
@require_vision
class ViTModelIntegrationTest(unittest.TestCase):
@cached_property
def default_feature_extractor(self):
def default_image_processor(self):
return (
ViTHybridImageProcessor.from_pretrained(VIT_HYBRID_PRETRAINED_MODEL_ARCHIVE_LIST[0])
if is_vision_available()
@@ -256,9 +256,9 @@ class ViTModelIntegrationTest(unittest.TestCase):
torch_device
)
feature_extractor = self.default_feature_extractor
image_processor = self.default_image_processor
image = prepare_img()
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
@@ -275,12 +275,12 @@ class ViTModelIntegrationTest(unittest.TestCase):
@slow
@require_accelerate
def test_accelerate_inference(self):
feature_extractor = ViTHybridImageProcessor.from_pretrained("google/vit-hybrid-base-bit-384")
image_processor = ViTHybridImageProcessor.from_pretrained("google/vit-hybrid-base-bit-384")
model = ViTHybridForImageClassification.from_pretrained("google/vit-hybrid-base-bit-384", device_map="auto")
image = prepare_img()
inputs = feature_extractor(images=image, return_tensors="pt")
inputs = image_processor(images=image, return_tensors="pt")
outputs = model(**inputs)
logits = outputs.logits
# model predicts one of the 1000 ImageNet classes
......
@@ -46,7 +46,7 @@ if is_tf_available():
if is_vision_available():
from PIL import Image
from transformers import ViTFeatureExtractor
from transformers import ViTImageProcessor
class TFViTMAEModelTester:
@@ -424,8 +424,8 @@ def prepare_img():
@require_vision
class TFViTMAEModelIntegrationTest(unittest.TestCase):
@cached_property
def default_feature_extractor(self):
return ViTFeatureExtractor.from_pretrained("facebook/vit-mae-base") if is_vision_available() else None
def default_image_processor(self):
return ViTImageProcessor.from_pretrained("facebook/vit-mae-base") if is_vision_available() else None
@slow
def test_inference_for_pretraining(self):
@@ -434,9 +434,9 @@ class TFViTMAEModelIntegrationTest(unittest.TestCase):
model = TFViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base")
feature_extractor = self.default_feature_extractor
image_processor = self.default_image_processor
image = prepare_img()
inputs = feature_extractor(images=image, return_tensors="tf")
inputs = image_processor(images=image, return_tensors="tf")
# prepare a noise vector that will be also used for testing the TF model
# (this way we can ensure that the PT and TF models operate on the same inputs)
......
@@ -42,7 +42,7 @@ if is_torch_available():
if is_vision_available():
from PIL import Image
from transformers import ViTFeatureExtractor
from transformers import ViTImageProcessor
class ViTMAEModelTester:
@@ -296,8 +296,8 @@ def prepare_img():
@require_vision
class ViTMAEModelIntegrationTest(unittest.TestCase):
@cached_property
def default_feature_extractor(self):
return ViTFeatureExtractor.from_pretrained("facebook/vit-mae-base") if is_vision_available() else None
def default_image_processor(self):
return ViTImageProcessor.from_pretrained("facebook/vit-mae-base") if is_vision_available() else None
@slow
def test_inference_for_pretraining(self):
@@ -306,9 +306,9 @@ class ViTMAEModelIntegrationTest(unittest.TestCase):
model = ViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base").to(torch_device)
feature_extractor = self.default_feature_extractor
image_processor = self.default_image_processor
image = prepare_img()
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
# prepare a noise vector that will be also used for testing the TF model
# (this way we can ensure that the PT and TF models operate on the same inputs)
......
@@ -38,7 +38,7 @@ if is_torch_available():
if is_vision_available():
from PIL import Image
from transformers import ViTFeatureExtractor
from transformers import ViTImageProcessor
class ViTMSNModelTester:
@@ -220,17 +220,17 @@ def prepare_img():
@require_vision
class ViTMSNModelIntegrationTest(unittest.TestCase):
@cached_property
def default_feature_extractor(self):
return ViTFeatureExtractor.from_pretrained("facebook/vit-msn-small") if is_vision_available() else None
def default_image_processor(self):
return ViTImageProcessor.from_pretrained("facebook/vit-msn-small") if is_vision_available() else None
@slow
def test_inference_image_classification_head(self):
torch.manual_seed(2)
model = ViTMSNForImageClassification.from_pretrained("facebook/vit-msn-small").to(torch_device)
feature_extractor = self.default_feature_extractor
image_processor = self.default_image_processor
image = prepare_img()
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
......
@@ -38,7 +38,7 @@ if is_torch_available():
if is_vision_available():
from PIL import Image
from transformers import AutoFeatureExtractor
from transformers import AutoImageProcessor
class YolosModelTester:
@@ -345,16 +345,16 @@ def prepare_img():
@require_vision
class YolosModelIntegrationTest(unittest.TestCase):
@cached_property
def default_feature_extractor(self):
return AutoFeatureExtractor.from_pretrained("hustvl/yolos-small") if is_vision_available() else None
def default_image_processor(self):
return AutoImageProcessor.from_pretrained("hustvl/yolos-small") if is_vision_available() else None
@slow
def test_inference_object_detection_head(self):
model = YolosForObjectDetection.from_pretrained("hustvl/yolos-small").to(torch_device)
feature_extractor = self.default_feature_extractor
image_processor = self.default_image_processor
image = prepare_img()
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
@@ -375,7 +375,7 @@ class YolosModelIntegrationTest(unittest.TestCase):
self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4))
# verify postprocessing
results = feature_extractor.post_process_object_detection(
results = image_processor.post_process_object_detection(
outputs, threshold=0.3, target_sizes=[image.size[::-1]]
)[0]
expected_scores = torch.tensor([0.9994, 0.9790, 0.9964, 0.9972, 0.9861]).to(torch_device)
......
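Finally, the YOLOS hunk shows that post-processing helpers carry over unchanged: `post_process_object_detection` is called on the image processor exactly as it was on the feature extractor. A hedged end-to-end sketch using the checkpoint and threshold from that test (the COCO fixture path again assumes a local repository checkout):

```python
import torch
from PIL import Image

from transformers import AutoImageProcessor, YolosForObjectDetection

image_processor = AutoImageProcessor.from_pretrained("hustvl/yolos-small")
model = YolosForObjectDetection.from_pretrained("hustvl/yolos-small")

image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
inputs = image_processor(images=image, return_tensors="pt")

# forward pass
with torch.no_grad():
    outputs = model(**inputs)

# Post-processing lives on the image processor, same method name as before the rename.
results = image_processor.post_process_object_detection(
    outputs, threshold=0.3, target_sizes=[image.size[::-1]]
)[0]
print(results["scores"], results["labels"], results["boxes"].shape)
```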