Unverified Commit bb6f6d53 authored by NielsRogge's avatar NielsRogge Committed by GitHub
Browse files

Add X-CLIP (#18852)

* First draft

* Improve conversion script

* Make vision encoder work

* More improvements

* Improve conversion script

* Fix quality

* Add MultiframeIntegrationTransformer

* More improvements

* Make MiT output work

* Fix quality

* Add prompts generator

* Add tests

* Fix some tests

* Fix some more tests

* Fix more tests

* Improve conversion script

* Fix model outputs

* Fix more tests

* Add XClipProcessor

* Use processor in conversion script

* Fix integration test

* Update README, fix docs

* Fix all tests

* Add MIT output to XClipOutput

* Create better variable names

* Rename XClip to XCLIP

* Extend conversion script

* Add support for large models

* Add support for 16 frame models

* Add another model

* Fix module issue

* Apply suggestions from code review

* Add figure to docs

* Fix CLIPProcessor issue

* Apply suggestions from code review

* Delete file

* Convert more checkpoints

* Convert last checkpoint

* Update nielsr to microsoft
parent 9832ac7c
...@@ -5202,6 +5202,37 @@ class WavLMPreTrainedModel(metaclass=DummyObject):
        requires_backends(self, ["torch"])
XCLIP_PRETRAINED_MODEL_ARCHIVE_LIST = None
class XCLIPModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class XCLIPPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class XCLIPTextModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class XCLIPVisionModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
XGLM_PRETRAINED_MODEL_ARCHIVE_LIST = None
......
...@@ -24,13 +24,6 @@ class CLIPFeatureExtractor(metaclass=DummyObject):
        requires_backends(self, ["vision"])
class CLIPProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class ConvNextFeatureExtractor(metaclass=DummyObject):
    _backends = ["vision"]
......
This diff is collapsed.
...@@ -49,6 +49,7 @@ CONFIG_CLASSES_TO_IGNORE_FOR_DOCSTRING_CHECKPOINT_CHECK = {
    "SpeechEncoderDecoderConfig",
    "VisionEncoderDecoderConfig",
    "VisionTextDualEncoderConfig",
"XCLIPConfig",
}
......
...@@ -207,6 +207,8 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
    "TFWav2Vec2ForCTC",
    "TFHubertForCTC",
    "MaskFormerForInstanceSegmentation",
"XCLIPVisionModel",
"XCLIPTextModel",
]
# Update this list for models that have multiple model types for the same
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment